wp2txt 0.7.8 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
File without changes
@@ -37,6 +37,10 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
+ convert_characters!(text)
41
+ make_reference!(text)
42
+ remove_ref!(text)
43
+
40
44
  parse text
41
45
  end
42
46
 
@@ -58,6 +62,22 @@ module Wp2txt
58
62
  end
59
63
 
60
64
  case mode
65
+ when :mw_ml_template
66
+ scanner = StringScanner.new(line)
67
+ str= process_nested_structure(scanner, "{{", "}}") {""}
68
+ if $ml_template_end_regex =~ str
69
+ mode = nil
70
+ end
71
+ @elements.last.last << line
72
+ next
73
+ when :mw_ml_link
74
+ scanner = StringScanner.new(line)
75
+ str= process_nested_structure(scanner, "[[", "]]") {""}
76
+ if $ml_link_end_regex =~ str
77
+ mode = nil
78
+ end
79
+ @elements.last.last << line
80
+ next
61
81
  when :mw_table
62
82
  if $in_table_regex2 =~ line
63
83
  mode = nil
@@ -91,19 +111,29 @@ module Wp2txt
91
111
  end
92
112
 
93
113
  case line
114
+ when $isolated_template_regex
115
+ @elements << create_element(:mw_isolated_template, line)
116
+ when $isolated_tag_regex
117
+ @elements << create_element(:mw_isolated_tag, line)
94
118
  when $blank_line_regex
95
119
  @elements << create_element(:mw_blank, "\n")
96
120
  when $redirect_regex
97
121
  @elements << create_element(:mw_redirect, line)
98
- when $in_template_regex
99
- @elements << create_element(:mw_template, line)
122
+ # when $in_template_regex
123
+ # @elements << create_element(:mw_template, line)
100
124
  when $in_heading_regex
101
125
  line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
102
126
  @elements << create_element(:mw_heading, "\n" + line + "\n")
103
127
  when $in_inputbox_regex
104
128
  @elements << create_element(:mw_inputbox, line)
129
+ when $ml_template_onset_regex
130
+ @elements << create_element(:mw_ml_template, line)
131
+ mode = :mw_ml_template
132
+ when $ml_link_onset_regex
133
+ @elements << create_element(:mw_ml_link, line)
134
+ mode = :mw_ml_link
105
135
  when $in_inputbox_regex1
106
- mode = :mw_inputbox
136
+ mode = :mw_inputbox
107
137
  @elements << create_element(:mw_inputbox, line)
108
138
  when $in_source_regex
109
139
  @elements << create_element(:mw_source, line)
@@ -138,7 +168,7 @@ module Wp2txt
138
168
  when $in_link_regex
139
169
  @elements << create_element(:mw_link, line)
140
170
  else
141
- @elements << create_element(:mw_paragraph, line)
171
+ @elements << create_element(:mw_paragraph, "\n" + line)
142
172
  end
143
173
  end
144
174
  @elements
data/lib/wp2txt/utils.rb CHANGED
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
16
16
  $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
17
  $html_hash = Hash[*$entities.flatten]
18
18
  $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
-
20
- $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
19
+ $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
+ $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
+ $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
+ $ml_linkend_regex = Regexp.new('\]\]\s*$')
23
+ $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
+ $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
21
25
  $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
22
-
23
26
  $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
24
27
  $in_inputbox_regex1 = Regexp.new('<inputbox>')
25
28
  $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
26
-
27
29
  $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
28
30
  $in_source_regex1 = Regexp.new('<source.*?>')
29
31
  $in_source_regex2 = Regexp.new('<\/source>')
30
-
31
32
  $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
32
33
  $in_math_regex1 = Regexp.new('<math.*?>')
33
34
  $in_math_regex2 = Regexp.new('<\/math>')
34
-
35
35
  $in_heading_regex = Regexp.new('^=+.*?=+$')
36
-
37
36
  $in_html_table_regex = Regexp.new('<table.*?><\/table>')
38
37
  $in_html_table_regex1 = Regexp.new('<table\b')
39
38
  $in_html_table_regex2 = Regexp.new('<\/\s*table>')
40
-
41
39
  $in_table_regex1 = Regexp.new('^\s*\{\|')
42
40
  $in_table_regex2 = Regexp.new('^\|\}.*?$')
43
-
44
41
  $in_unordered_regex = Regexp.new('^\*')
45
42
  $in_ordered_regex = Regexp.new('^\#')
46
43
  $in_pre_regex = Regexp.new('^ ')
47
44
  $in_definition_regex = Regexp.new('^[\;\:]')
48
-
49
45
  $blank_line_regex = Regexp.new('^\s*$')
50
-
51
46
  $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
52
-
53
47
  $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
54
48
  $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
55
-
56
49
  $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
57
50
  $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
58
51
  $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
75
68
  $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
76
69
  $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
77
70
 
71
+ $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
78
72
  $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
79
73
  $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
80
74
 
@@ -111,27 +105,26 @@ module Wp2txt
111
105
 
112
106
  def format_wiki!(text, has_retried = false)
113
107
  escape_nowiki!(text)
114
-
115
108
  process_interwiki_links!(text)
116
109
  process_external_links!(text)
117
-
118
110
  unescape_nowiki!(text)
119
- end
120
-
121
- def format_article!(text)
111
+ #####
122
112
  remove_directive!(text)
123
113
  remove_emphasis!(text)
124
114
  mndash!(text)
125
- make_reference!(text)
126
- format_ref!(text)
127
115
  remove_hr!(text)
128
116
  remove_tag!(text)
129
- convert_characters!(text)
130
- correct_inline_template!(text) unless $leave_template
131
- remove_templates!(text) unless $leave_template
132
- remove_table!(text) unless $leave_table
117
+ correct_inline_template!(text) unless $leave_inline_template
118
+ remove_templates!(text) unless $leave_inline_template
119
+ # remove_table!(text) unless $leave_table
133
120
  end
134
121
 
122
+ def cleanup!(text)
123
+ text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/\n\n\n+/m){"\n\n"}
125
+ text.strip!
126
+ text << "\n\n"
127
+ end
135
128
  #################### parser for nested structure ####################
136
129
 
137
130
  def process_nested_structure(scanner, left, right, &block)
@@ -237,6 +230,10 @@ module Wp2txt
237
230
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
238
231
  ""
239
232
  end
233
+ scanner = StringScanner.new(result)
234
+ result = process_nested_structure(scanner, "{", "}") do |contents|
235
+ ""
236
+ end
240
237
  str.replace(result)
241
238
  end
242
239
 
@@ -295,48 +292,48 @@ module Wp2txt
295
292
  str.gsub!($mndash_regex, "–")
296
293
  end
297
294
 
298
- def remove_hr!(page)
299
- page.gsub!($remove_hr_regex, "")
295
+ def remove_hr!(str)
296
+ str.gsub!($remove_hr_regex, "")
300
297
  end
301
298
 
299
+ def remove_ref!(str)
300
+ str.gsub!($format_ref_regex){""}
301
+ end
302
+
302
303
  def make_reference!(str)
303
304
  str.gsub!($make_reference_regex_a){"\n"}
304
305
  str.gsub!($make_reference_regex_b){""}
305
306
  str.gsub!($make_reference_regex_c){"[ref]"}
306
307
  str.gsub!($make_reference_regex_d){"[/ref]"}
307
- str.gsub!($format_ref_regex){""} unless $leave_ref
308
- end
309
-
310
- def format_ref!(page)
311
- ###### do nothing for now
312
- # page.gsub!($format_ref_regex) do
313
- # end
314
308
  end
315
309
 
316
310
  def correct_inline_template!(str)
317
- str.gsub!($remove_inline_regex) do
318
- key = $1
319
- if $onset_bar_regex =~ key
320
- result = key
321
- elsif
322
- info = key.split("|")
323
- type_code = info.first
324
- case type_code
325
- when $type_code_regex
326
- out = info[-1]
327
- else
328
- if $leave_template
329
- out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
330
- else
331
- out = ""
311
+ scanner = StringScanner.new(str)
312
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
+ parts = contents.split("|")
314
+ # type_code = parts.first
315
+ # case type_code
316
+ # when $type_code_regex
317
+ # out = parts[-1]
318
+ # else
319
+ # case parts.size
320
+ # when 0
321
+ # out = ""
322
+ # when 1
323
+ # out = parts.first || ""
324
+ # else
325
+ # while parts.size > 2 && parts.last.split("=").size > 1
326
+ while parts.size > 1 && parts.last.split("=").size > 1
327
+ parts.pop
332
328
  end
333
- end
334
- out
335
- else
336
- ""
337
- end
329
+ out = parts.last || ""
330
+ # end
331
+ # end
332
+ out.strip
338
333
  end
334
+ str.replace result
339
335
  end
336
+
340
337
 
341
338
  #################### file related utilities ####################
342
339
 
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.8"
2
+ VERSION = "0.9.2"
3
3
  end
data/lib/wp2txt.rb CHANGED
@@ -3,13 +3,10 @@
3
3
 
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
- # require "rubygems"
7
- # require "bundler/setup"
8
-
9
6
  require "nokogiri"
10
- # require "oga"
11
- # require "ox"
7
+ require "parallel"
12
8
 
9
+ require 'etc'
13
10
  require 'pp'
14
11
  require "wp2txt/article"
15
12
  require "wp2txt/utils"
@@ -29,7 +26,7 @@ module Wp2txt
29
26
 
30
27
  include Wp2txt
31
28
 
32
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
29
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
33
30
  @parent = parent
34
31
  @fp = nil
35
32
 
@@ -38,6 +35,8 @@ module Wp2txt
38
35
  @tfile_size = tfile_size
39
36
  @convert = convert
40
37
  @strip_tmarker = strip_tmarker
38
+ num_cores_available = Etc.nprocessors
39
+ @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
41
40
  end
42
41
 
43
42
  def file_size(file)
@@ -102,6 +101,7 @@ module Wp2txt
102
101
  if /.bz2$/ =~ @input_file
103
102
  unless NO_BZ2
104
103
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
+ @parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
105
105
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
106
  @infile_size = file_size(file)
107
107
  @parent.msg("... Done.", 1)
@@ -113,6 +113,7 @@ module Wp2txt
113
113
  else
114
114
  file = IO.popen("bzip2 -c -d #{@input_file}")
115
115
  end
116
+ @parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
116
117
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
117
118
  @infile_size = file_size(file)
118
119
  @parent.msg("... Done.", 1)
@@ -237,81 +238,73 @@ module Wp2txt
237
238
  end_flag = false
238
239
  terminal_round = false
239
240
  output_text = ""
241
+ pages = []
242
+ data_empty = false
240
243
 
241
- while page = get_page
242
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
243
- xml = xmlns + page + "</mediawiki>"
244
-
245
- input = Nokogiri::XML(xml, nil, 'UTF-8')
246
- page = input.xpath("//xmlns:text").first
247
- pp_title = page.parent.parent.at_css "title"
248
- title = pp_title.content
249
- next if /\:/ =~ title
250
- text = page.content
251
-
252
- # input = Oga.parse_xml(xml)
253
- # page = input.xpath("//xmlns:text").first
254
- # title = page.parent.parent.xpath("//xmlns:title").first.text
255
- # next if /\:/ =~ title
256
- # text = page.text
244
+ begin
245
+ page = get_page
246
+ if page
247
+ pages << page
248
+ else
249
+ data_empty = true
250
+ end
251
+ if data_empty || pages.size == @num_threads
252
+ # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
+ pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
254
+ page_text = {:order => n, :data => nil}
255
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
256
+ xml = xmlns + page + "</mediawiki>"
257
257
 
258
- # input = Ox.load(xml, :encoding => "UTF-8")
259
- # title = ""
260
- # text = ""
261
- # input.nodes.first.nodes.each do |n|
262
- # if n.name == "title"
263
- # title = n.nodes.first
264
- # if /\:/ =~ title
265
- # title = ""
266
- # break
267
- # end
268
- # elsif n.name == "revision"
269
- # n.nodes.each do |o|
270
- # if o.name == "text"
271
- # text = o.nodes.first
272
- # break
273
- # end
274
- # end
275
- # end
276
- # end
277
- # next if title == "" || text == ""
258
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
259
+ page = input.xpath("//xmlns:text").first
260
+ pp_title = page.parent.parent.at_css "title"
261
+ title = pp_title.content
262
+ unless /\:/ =~ title
263
+ text = page.content
264
+ text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
265
+ num_of_newlines = content.count("\n")
266
+ if num_of_newlines == 0
267
+ ""
268
+ else
269
+ "\n" * num_of_newlines
270
+ end
271
+ end
272
+ article = Article.new(text, title, @strip_tmarker)
273
+ page_text[:data] = block.call(article)
274
+ end
275
+ page_text
276
+ end
277
+ pages.clear
278
+ pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
279
+ pages_text.each do |page_text|
280
+ output_text << page_text
281
+ @count ||= 0; @count += 1;
282
+ @total_size = output_text.bytesize
283
+ # flagged when data exceeds the size of output file
284
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
285
+ end
278
286
 
279
- # remove all comment texts
280
- # and insert as many number of new line chars included in
281
- # each comment instead
282
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
283
- num_of_newlines = content.count("\n")
284
- if num_of_newlines == 0
285
- ""
286
- else
287
- "\n" * num_of_newlines
287
+ #close the present file, then open a new one
288
+ if end_flag
289
+ cleanup!(output_text)
290
+ @fp.puts(output_text)
291
+ output_text = ""
292
+ @total_size = 0
293
+ end_flag = false
294
+ @fp.close
295
+ @file_index += 1
296
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
297
+ @outfiles << outfilename
298
+ @fp = File.open(outfilename, "w")
299
+ next
288
300
  end
289
301
  end
290
-
291
- @count ||= 0;@count += 1;
292
-
293
- article = Article.new(text, title, @strip_tmarker)
294
- output_text += block.call(article)
295
- @total_size = output_text.bytesize
302
+ end while !data_empty
296
303
 
297
- # flagged when data exceeds the size of output file
298
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
299
-
300
- #close the present file, then open a new one
301
- if end_flag
302
- @fp.puts(output_text)
303
- output_text = ""
304
- @total_size = 0
305
- end_flag = false
306
- @fp.close
307
- @file_index += 1
308
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
309
- @outfiles << outfilename
310
- @fp = File.open(outfilename, "w")
311
- next
312
- end
304
+ if output_text != ""
305
+ cleanup!(output_text)
306
+ @fp.puts(output_text)
313
307
  end
314
- @fp.puts(output_text) if output_text != ""
315
308
  notify_parent(true)
316
309
  @parent.after
317
310
  @fp.close
@@ -351,4 +344,5 @@ module Wp2txt
351
344
  @parent.msg("Processing finished", 1)
352
345
  end
353
346
  end
354
- end
347
+ end
348
+
data/spec/utils_spec.rb CHANGED
@@ -182,22 +182,34 @@ describe "Wp2txt" do
182
182
  end
183
183
  end
184
184
 
185
- # describe "process_template" do
186
- # it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # expect(process_template(str_before)).to eq str_after
190
- # str_before = "{{lang|en|Japan}}"
191
- # str_after = "Japan"
192
- # expect(process_template(str_before)).to eq str_after
193
- # str_before = "{{a|b=c|d=f}}"
194
- # str_after = "a"
195
- # expect(process_template(str_before)).to eq str_after
196
- # str_before = "{{a|b|{{c|d|e}}}}"
197
- # str_after = "e"
198
- # expect(process_template(str_before)).to eq str_after
199
- # end
200
- # end
185
+ describe "correct_inline_template!" do
186
+ it "removes brackets and leaving some text" do
187
+ # str_before = "{{}}"
188
+ # str_after = ""
189
+ # correct_inline_template!(str_before)
190
+ # expect(str_before).to eq str_after
191
+ str_before = "{{MedalCountry | {{JPN}} }}"
192
+ str_after = "JPN"
193
+ correct_inline_template!(str_before)
194
+ expect(str_before).to eq str_after
195
+ str_before = "{{lang|en|Japan}}"
196
+ str_after = "Japan"
197
+ correct_inline_template!(str_before)
198
+ expect(str_before).to eq str_after
199
+ str_before = "{{a|b=c|d=f}}"
200
+ str_after = "a"
201
+ correct_inline_template!(str_before)
202
+ expect(str_before).to eq str_after
203
+ str_before = "{{a|b|{{c|d|e}}}}"
204
+ str_after = "e"
205
+ correct_inline_template!(str_before)
206
+ expect(str_before).to eq str_after
207
+ str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
208
+ str_after = "日本人に多く見受けられる"
209
+ correct_inline_template!(str_before)
210
+ expect(str_before).to eq str_after
211
+ end
212
+ end
201
213
 
202
214
  # describe "expand_template" do
203
215
  # it "gets data corresponding to a given template using mediawiki api" do
data/wp2txt.gemspec CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
23
23
  # s.add_development_dependency "rake"
24
24
 
25
25
  s.add_dependency "nokogiri"
26
+ s.add_dependency "parallel"
26
27
  s.add_dependency "htmlentities"
27
- s.add_dependency "trollop"
28
+ s.add_dependency "optimist"
28
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.8
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-01 00:00:00.000000000 Z
11
+ date: 2022-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: parallel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: htmlentities
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +53,7 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: trollop
56
+ name: optimist
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
@@ -69,8 +83,10 @@ files:
69
83
  - Rakefile
70
84
  - bin/benchmark.rb
71
85
  - bin/wp2txt
72
- - data/testdata.bz2
73
- - error_log.txt
86
+ - data/output_samples/testdata_en.txt
87
+ - data/output_samples/testdata_ja.txt
88
+ - data/testdata_en.bz2
89
+ - data/testdata_ja.bz2
74
90
  - lib/wp2txt.rb
75
91
  - lib/wp2txt/article.rb
76
92
  - lib/wp2txt/mw_api.rb
@@ -83,7 +99,7 @@ files:
83
99
  homepage: http://github.com/yohasebe/wp2txt
84
100
  licenses: []
85
101
  metadata: {}
86
- post_install_message:
102
+ post_install_message:
87
103
  rdoc_options: []
88
104
  require_paths:
89
105
  - lib
@@ -98,9 +114,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
114
  - !ruby/object:Gem::Version
99
115
  version: '0'
100
116
  requirements: []
101
- rubyforge_project: wp2txt
102
- rubygems_version: 2.4.2
103
- signing_key:
117
+ rubygems_version: 3.3.3
118
+ signing_key:
104
119
  specification_version: 4
105
120
  summary: Wikipedia dump to text converter
106
121
  test_files:
data/error_log.txt DELETED
@@ -1 +0,0 @@
1
- [[アンパサンド]]