wp2txt 0.7.7 → 0.7.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d239913dc9fcda87677ec2eeed1ae51542b9ae7b
4
- data.tar.gz: 6ee520b960dc9bc23a6cf20345cc36a0740d5b96
3
+ metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
4
+ data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
5
5
  SHA512:
6
- metadata.gz: 4520570cf6f4c8c9c955a574523a4222ea3a9e308a86a510c407ac095040f73dc9cf393c49711787709129383080da077cc322a8e0aae9a401d24e9015e5baa8
7
- data.tar.gz: 93f51183f722e6371394350777971f4dc66adf777d4613e547384fe96232071dff1078ff5189841cebebb027a63f072fd09013298a3d6ef042c3f22e25653dc1
6
+ metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
7
+ data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
data/README.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  Wikipedia dump file to text converter
4
4
 
5
+ **Important:** This project is a *work in progress*, and it could be slow, unstable, and even destructive! Please use it with caution.
6
+
5
7
  ### About ###
6
8
 
7
9
  WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML and compressed with Bzip2), stripping all MediaWiki markup and other metadata. It was originally intended to be useful for researchers looking for an easy way to obtain open-source multilingual corpora, but it may be handy for other purposes as well.
@@ -26,14 +28,13 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
26
28
 
27
29
  Command line options are as follows:
28
30
 
29
- *CAUTION:* Command line options in the current version have been drastically changed from previous versions.
31
+ **Important:** Command line options in the current version have been drastically changed from previous versions.
30
32
 
31
33
  Usage: wp2txt [options]
32
34
  where [options] are:
33
35
  --input-file, -i: Wikipedia dump file with .bz2 (compressed) or
34
36
  .txt (uncompressed) format
35
- --output-dir, -o <s>: Output directory (default:
36
- /Users/yohasebe/Dropbox/code/wp2txt)
37
+ --output-dir, -o <s>: Output directory (default: current directory)
37
38
  --convert, --no-convert, -c: Output in plain text (converting from XML)
38
39
  (default: true)
39
40
  --list, --no-list, -l: Show list items in output (default: true)
@@ -41,14 +42,14 @@ Command line options are as follows:
41
42
  --title, --no-title, -t: Show page titles in output (default: true)
42
43
  --table, -a: Show table source code in output
43
44
  --template, -e: leave inline template notations unmodified
44
- --redirect, -r: Show redirect destination
45
+ --ref, -r: leave reference notations in the format
46
+ [ref]...[/ref]
47
+ --redirect: Show redirect destination
45
48
  --marker, --no-marker, -m: Show symbols prefixed to list items,
46
49
  definitions, etc. (Default: true)
47
50
  --category, -g: Show article category information
48
51
  --file-size, -f <i>: Approximate size (in MB) of each output file
49
52
  (default: 10)
50
- --limit-recur, -u <i>: Max number of recursive call (0 to 10)
51
- (default: 10)
52
53
  --version, -v: Print version and exit
53
54
  --help, -h: Show this message
54
55
 
data/bin/wp2txt CHANGED
@@ -32,32 +32,31 @@ EOS
32
32
  opt :title, "Show page titles in output", :default => true
33
33
  opt :table, "Show table source code in output", :default => false
34
34
  opt :template, "leave inline template notations unmodified", :default => false
35
+ opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
35
36
  opt :redirect, "Show redirect destination", :default => false
36
37
  opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
37
38
  opt :category, "Show article category information", :default => false
38
39
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
39
- opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
40
40
  end
41
41
  Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
42
42
  Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
43
- Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
44
43
 
45
44
  input_file = ARGV[0]
46
45
  output_dir = opts[:output_dir]
47
46
  tfile_size = opts[:file_size]
48
- limit_recur = opts[:limit_recur]
49
47
  convert = opts[:convert]
50
48
  strip_tmarker = opts[:marker] ? false : true
51
49
  opt_array = [:title, :list, :heading, :table, :redirect]
52
50
  $leave_template = true if opts[:template]
53
51
  $leave_table = true if opts[:table]
52
+ $leave_ref = true if opts[:ref]
54
53
  config = {}
55
54
  opt_array.each do |opt|
56
55
  config[opt] = opts[opt]
57
56
  end
58
57
 
59
58
  parent = Wp2txt::CmdProgbar.new
60
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker, limit_recur)
59
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
61
60
 
62
61
  wpconv.extract_text do |article|
63
62
  format_wiki!(article.title)
@@ -29,7 +29,7 @@ module Wp2txt
29
29
 
30
30
  include Wp2txt
31
31
 
32
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10)
32
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
33
33
  @parent = parent
34
34
  @fp = nil
35
35
 
@@ -38,9 +38,6 @@ module Wp2txt
38
38
  @tfile_size = tfile_size
39
39
  @convert = convert
40
40
  @strip_tmarker = strip_tmarker
41
-
42
- #max number of recursive calls (global variable)
43
- $limit_recur = limit_recur
44
41
  end
45
42
 
46
43
  def file_size(file)
@@ -134,7 +134,7 @@ module Wp2txt
134
134
 
135
135
  #################### parser for nested structure ####################
136
136
 
137
- def process_nested_structure(scanner, left, right, recur_count, &block)
137
+ def process_nested_structure(scanner, left, right, &block)
138
138
  buffer = ""
139
139
  begin
140
140
  if left == "[" && right == "]"
@@ -168,12 +168,11 @@ module Wp2txt
168
168
  end
169
169
  buffer << scanner.rest
170
170
 
171
- recur_count = recur_count - 1
172
- if recur_count < 0 || buffer == scanner.string
171
+ if buffer == scanner.string
173
172
  return buffer
174
173
  else
175
174
  scanner.string = buffer
176
- return process_nested_structure(scanner, left, right, recur_count, &block) || ""
175
+ return process_nested_structure(scanner, left, right, &block) || ""
177
176
  end
178
177
  rescue => e
179
178
  return scanner.string
@@ -204,7 +203,7 @@ module Wp2txt
204
203
 
205
204
  def process_interwiki_links!(str)
206
205
  scanner = StringScanner.new(str)
207
- result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
206
+ result = process_nested_structure(scanner, "[[", "]]") do |contents|
208
207
  parts = contents.split("|")
209
208
  case parts.size
210
209
  when 1
@@ -219,7 +218,7 @@ module Wp2txt
219
218
 
220
219
  def process_external_links!(str)
221
220
  scanner = StringScanner.new(str)
222
- result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
221
+ result = process_nested_structure(scanner, "[", "]") do |contents|
223
222
  parts = contents.split(" ", 2)
224
223
  case parts.size
225
224
  when 1
@@ -235,7 +234,7 @@ module Wp2txt
235
234
 
236
235
  def remove_templates!(str)
237
236
  scanner = StringScanner.new(str)
238
- result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
237
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
239
238
  ""
240
239
  end
241
240
  str.replace(result)
@@ -243,7 +242,7 @@ module Wp2txt
243
242
 
244
243
  def remove_table!(str)
245
244
  scanner = StringScanner.new(str)
246
- result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
245
+ result = process_nested_structure(scanner, "{|", "|}") do |contents|
247
246
  ""
248
247
  end
249
248
  str.replace(result)
@@ -301,10 +300,11 @@ module Wp2txt
301
300
  end
302
301
 
303
302
  def make_reference!(str)
304
- str.gsub!($make_reference_regex_a, "\n")
305
- str.gsub!($make_reference_regex_b, "")
306
- str.gsub!($make_reference_regex_c, "[ref]")
307
- str.gsub!($make_reference_regex_d, "[/ref]")
303
+ str.gsub!($make_reference_regex_a){"\n"}
304
+ str.gsub!($make_reference_regex_b){""}
305
+ str.gsub!($make_reference_regex_c){"[ref]"}
306
+ str.gsub!($make_reference_regex_d){"[/ref]"}
307
+ str.gsub!($format_ref_regex){""} unless $leave_ref
308
308
  end
309
309
 
310
310
  def format_ref!(page)
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.7"
2
+ VERSION = "0.7.8"
3
3
  end
@@ -6,8 +6,6 @@ require 'wp2txt'
6
6
  require 'wp2txt/article'
7
7
  require 'wp2txt/utils'
8
8
 
9
- $limit_recur = 3
10
-
11
9
  describe "Wp2txt" do
12
10
  it "contains mediawiki-format related functions:" do
13
11
  end
@@ -22,7 +20,7 @@ describe "Wp2txt" do
22
20
  str_before = "[[ab[[cde[[alfa]]]]fg]]"
23
21
  str_after = "<<ab<<cde<<alfa>>>>fg>>"
24
22
  scanner = StringScanner.new(str_before)
25
- str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
23
+ str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
26
24
  "<<" + content + ">>"
27
25
  end
28
26
  expect(str_processed).to eq str_after
@@ -32,7 +30,7 @@ describe "Wp2txt" do
32
30
  str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
33
31
  |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
34
32
  scanner = StringScanner.new(str_before)
35
- str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
33
+ str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
36
34
  "<<" + content + ">>"
37
35
  end
38
36
  #str_processed.should == str_after
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.7
4
+ version: 0.7.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe