wp2txt 0.7.7 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d239913dc9fcda87677ec2eeed1ae51542b9ae7b
4
- data.tar.gz: 6ee520b960dc9bc23a6cf20345cc36a0740d5b96
3
+ metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
4
+ data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
5
5
  SHA512:
6
- metadata.gz: 4520570cf6f4c8c9c955a574523a4222ea3a9e308a86a510c407ac095040f73dc9cf393c49711787709129383080da077cc322a8e0aae9a401d24e9015e5baa8
7
- data.tar.gz: 93f51183f722e6371394350777971f4dc66adf777d4613e547384fe96232071dff1078ff5189841cebebb027a63f072fd09013298a3d6ef042c3f22e25653dc1
6
+ metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
7
+ data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
data/README.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  Wikipedia dump file to text converter
4
4
 
5
+ **Important:** This project is a *work in progress*; it may be slow, unstable, or even destructive! Please use it with caution.
6
+
5
7
  ### About ###
6
8
 
7
9
  WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML and compressed with Bzip2), stripping all MediaWiki markup and other metadata. It was originally intended to be useful for researchers looking for an easy way to obtain open-source multilingual corpora, but it may be handy for other purposes.
@@ -26,14 +28,13 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
26
28
 
27
29
  Command line options are as follows:
28
30
 
29
- *CAUTION:* Command line options in the current version have been drastically changed from previous versions.
31
+ **Important** Command line options in the current version have been drastically changed from previous versions.
30
32
 
31
33
  Usage: wp2txt [options]
32
34
  where [options] are:
33
35
  --input-file, -i: Wikipedia dump file with .bz2 (compressed) or
34
36
  .txt (uncompressed) format
35
- --output-dir, -o <s>: Output directory (default:
36
- /Users/yohasebe/Dropbox/code/wp2txt)
37
+ --output-dir, -o <s>: Output directory (default: current directory)
37
38
  --convert, --no-convert, -c: Output in plain text (converting from XML)
38
39
  (default: true)
39
40
  --list, --no-list, -l: Show list items in output (default: true)
@@ -41,14 +42,14 @@ Command line options are as follows:
41
42
  --title, --no-title, -t: Show page titles in output (default: true)
42
43
  --table, -a: Show table source code in output
43
44
  --template, -e: leave inline template notations unmodified
44
- --redirect, -r: Show redirect destination
45
+ --ref, -r: leave reference notations in the format
46
+ [ref]...[/ref]
47
+ --redirect: Show redirect destination
45
48
  --marker, --no-marker, -m: Show symbols prefixed to list items,
46
49
  definitions, etc. (Default: true)
47
50
  --category, -g: Show article category information
48
51
  --file-size, -f <i>: Approximate size (in MB) of each output file
49
52
  (default: 10)
50
- --limit-recur, -u <i>: Max number of recursive call (0 to 10)
51
- (default: 10)
52
53
  --version, -v: Print version and exit
53
54
  --help, -h: Show this message
54
55
 
data/bin/wp2txt CHANGED
@@ -32,32 +32,31 @@ EOS
32
32
  opt :title, "Show page titles in output", :default => true
33
33
  opt :table, "Show table source code in output", :default => false
34
34
  opt :template, "leave inline template notations unmodified", :default => false
35
+ opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
35
36
  opt :redirect, "Show redirect destination", :default => false
36
37
  opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
37
38
  opt :category, "Show article category information", :default => false
38
39
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
39
- opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
40
40
  end
41
41
  Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
42
42
  Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
43
- Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
44
43
 
45
44
  input_file = ARGV[0]
46
45
  output_dir = opts[:output_dir]
47
46
  tfile_size = opts[:file_size]
48
- limit_recur = opts[:limit_recur]
49
47
  convert = opts[:convert]
50
48
  strip_tmarker = opts[:marker] ? false : true
51
49
  opt_array = [:title, :list, :heading, :table, :redirect]
52
50
  $leave_template = true if opts[:template]
53
51
  $leave_table = true if opts[:table]
52
+ $leave_ref = true if opts[:ref]
54
53
  config = {}
55
54
  opt_array.each do |opt|
56
55
  config[opt] = opts[opt]
57
56
  end
58
57
 
59
58
  parent = Wp2txt::CmdProgbar.new
60
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker, limit_recur)
59
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
61
60
 
62
61
  wpconv.extract_text do |article|
63
62
  format_wiki!(article.title)
@@ -29,7 +29,7 @@ module Wp2txt
29
29
 
30
30
  include Wp2txt
31
31
 
32
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10)
32
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
33
33
  @parent = parent
34
34
  @fp = nil
35
35
 
@@ -38,9 +38,6 @@ module Wp2txt
38
38
  @tfile_size = tfile_size
39
39
  @convert = convert
40
40
  @strip_tmarker = strip_tmarker
41
-
42
- #max number of recursive calls (global variable)
43
- $limit_recur = limit_recur
44
41
  end
45
42
 
46
43
  def file_size(file)
@@ -134,7 +134,7 @@ module Wp2txt
134
134
 
135
135
  #################### parser for nested structure ####################
136
136
 
137
- def process_nested_structure(scanner, left, right, recur_count, &block)
137
+ def process_nested_structure(scanner, left, right, &block)
138
138
  buffer = ""
139
139
  begin
140
140
  if left == "[" && right == "]"
@@ -168,12 +168,11 @@ module Wp2txt
168
168
  end
169
169
  buffer << scanner.rest
170
170
 
171
- recur_count = recur_count - 1
172
- if recur_count < 0 || buffer == scanner.string
171
+ if buffer == scanner.string
173
172
  return buffer
174
173
  else
175
174
  scanner.string = buffer
176
- return process_nested_structure(scanner, left, right, recur_count, &block) || ""
175
+ return process_nested_structure(scanner, left, right, &block) || ""
177
176
  end
178
177
  rescue => e
179
178
  return scanner.string
@@ -204,7 +203,7 @@ module Wp2txt
204
203
 
205
204
  def process_interwiki_links!(str)
206
205
  scanner = StringScanner.new(str)
207
- result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
206
+ result = process_nested_structure(scanner, "[[", "]]") do |contents|
208
207
  parts = contents.split("|")
209
208
  case parts.size
210
209
  when 1
@@ -219,7 +218,7 @@ module Wp2txt
219
218
 
220
219
  def process_external_links!(str)
221
220
  scanner = StringScanner.new(str)
222
- result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
221
+ result = process_nested_structure(scanner, "[", "]") do |contents|
223
222
  parts = contents.split(" ", 2)
224
223
  case parts.size
225
224
  when 1
@@ -235,7 +234,7 @@ module Wp2txt
235
234
 
236
235
  def remove_templates!(str)
237
236
  scanner = StringScanner.new(str)
238
- result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
237
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
239
238
  ""
240
239
  end
241
240
  str.replace(result)
@@ -243,7 +242,7 @@ module Wp2txt
243
242
 
244
243
  def remove_table!(str)
245
244
  scanner = StringScanner.new(str)
246
- result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
245
+ result = process_nested_structure(scanner, "{|", "|}") do |contents|
247
246
  ""
248
247
  end
249
248
  str.replace(result)
@@ -301,10 +300,11 @@ module Wp2txt
301
300
  end
302
301
 
303
302
  def make_reference!(str)
304
- str.gsub!($make_reference_regex_a, "\n")
305
- str.gsub!($make_reference_regex_b, "")
306
- str.gsub!($make_reference_regex_c, "[ref]")
307
- str.gsub!($make_reference_regex_d, "[/ref]")
303
+ str.gsub!($make_reference_regex_a){"\n"}
304
+ str.gsub!($make_reference_regex_b){""}
305
+ str.gsub!($make_reference_regex_c){"[ref]"}
306
+ str.gsub!($make_reference_regex_d){"[/ref]"}
307
+ str.gsub!($format_ref_regex){""} unless $leave_ref
308
308
  end
309
309
 
310
310
  def format_ref!(page)
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.7"
2
+ VERSION = "0.7.8"
3
3
  end
@@ -6,8 +6,6 @@ require 'wp2txt'
6
6
  require 'wp2txt/article'
7
7
  require 'wp2txt/utils'
8
8
 
9
- $limit_recur = 3
10
-
11
9
  describe "Wp2txt" do
12
10
  it "contains mediawiki-format related functions:" do
13
11
  end
@@ -22,7 +20,7 @@ describe "Wp2txt" do
22
20
  str_before = "[[ab[[cde[[alfa]]]]fg]]"
23
21
  str_after = "<<ab<<cde<<alfa>>>>fg>>"
24
22
  scanner = StringScanner.new(str_before)
25
- str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
23
+ str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
26
24
  "<<" + content + ">>"
27
25
  end
28
26
  expect(str_processed).to eq str_after
@@ -32,7 +30,7 @@ describe "Wp2txt" do
32
30
  str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
33
31
  |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
34
32
  scanner = StringScanner.new(str_before)
35
- str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
33
+ str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
36
34
  "<<" + content + ">>"
37
35
  end
38
36
  #str_processed.should == str_after
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.7
4
+ version: 0.7.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe