wp2txt 0.7.8 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Binary file
File without changes
@@ -37,6 +37,10 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
+ convert_characters!(text)
41
+ make_reference!(text)
42
+ remove_ref!(text)
43
+
40
44
  parse text
41
45
  end
42
46
 
@@ -58,6 +62,22 @@ module Wp2txt
58
62
  end
59
63
 
60
64
  case mode
65
+ when :mw_ml_template
66
+ scanner = StringScanner.new(line)
67
+ str= process_nested_structure(scanner, "{{", "}}") {""}
68
+ if $ml_template_end_regex =~ str
69
+ mode = nil
70
+ end
71
+ @elements.last.last << line
72
+ next
73
+ when :mw_ml_link
74
+ scanner = StringScanner.new(line)
75
+ str= process_nested_structure(scanner, "[[", "]]") {""}
76
+ if $ml_link_end_regex =~ str
77
+ mode = nil
78
+ end
79
+ @elements.last.last << line
80
+ next
61
81
  when :mw_table
62
82
  if $in_table_regex2 =~ line
63
83
  mode = nil
@@ -91,19 +111,29 @@ module Wp2txt
91
111
  end
92
112
 
93
113
  case line
114
+ when $isolated_template_regex
115
+ @elements << create_element(:mw_isolated_template, line)
116
+ when $isolated_tag_regex
117
+ @elements << create_element(:mw_isolated_tag, line)
94
118
  when $blank_line_regex
95
119
  @elements << create_element(:mw_blank, "\n")
96
120
  when $redirect_regex
97
121
  @elements << create_element(:mw_redirect, line)
98
- when $in_template_regex
99
- @elements << create_element(:mw_template, line)
122
+ # when $in_template_regex
123
+ # @elements << create_element(:mw_template, line)
100
124
  when $in_heading_regex
101
125
  line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
102
126
  @elements << create_element(:mw_heading, "\n" + line + "\n")
103
127
  when $in_inputbox_regex
104
128
  @elements << create_element(:mw_inputbox, line)
129
+ when $ml_template_onset_regex
130
+ @elements << create_element(:mw_ml_template, line)
131
+ mode = :mw_ml_template
132
+ when $ml_link_onset_regex
133
+ @elements << create_element(:mw_ml_link, line)
134
+ mode = :mw_ml_link
105
135
  when $in_inputbox_regex1
106
- mode = :mw_inputbox
136
+ mode = :mw_inputbox
107
137
  @elements << create_element(:mw_inputbox, line)
108
138
  when $in_source_regex
109
139
  @elements << create_element(:mw_source, line)
@@ -138,7 +168,7 @@ module Wp2txt
138
168
  when $in_link_regex
139
169
  @elements << create_element(:mw_link, line)
140
170
  else
141
- @elements << create_element(:mw_paragraph, line)
171
+ @elements << create_element(:mw_paragraph, "\n" + line)
142
172
  end
143
173
  end
144
174
  @elements
data/lib/wp2txt/utils.rb CHANGED
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
16
16
  $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
17
  $html_hash = Hash[*$entities.flatten]
18
18
  $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
-
20
- $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
19
+ $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
+ $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
+ $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
+ $ml_link_end_regex = Regexp.new('\]\]\s*$')
23
+ $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
+ $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
21
25
  $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
22
-
23
26
  $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
24
27
  $in_inputbox_regex1 = Regexp.new('<inputbox>')
25
28
  $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
26
-
27
29
  $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
28
30
  $in_source_regex1 = Regexp.new('<source.*?>')
29
31
  $in_source_regex2 = Regexp.new('<\/source>')
30
-
31
32
  $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
32
33
  $in_math_regex1 = Regexp.new('<math.*?>')
33
34
  $in_math_regex2 = Regexp.new('<\/math>')
34
-
35
35
  $in_heading_regex = Regexp.new('^=+.*?=+$')
36
-
37
36
  $in_html_table_regex = Regexp.new('<table.*?><\/table>')
38
37
  $in_html_table_regex1 = Regexp.new('<table\b')
39
38
  $in_html_table_regex2 = Regexp.new('<\/\s*table>')
40
-
41
39
  $in_table_regex1 = Regexp.new('^\s*\{\|')
42
40
  $in_table_regex2 = Regexp.new('^\|\}.*?$')
43
-
44
41
  $in_unordered_regex = Regexp.new('^\*')
45
42
  $in_ordered_regex = Regexp.new('^\#')
46
43
  $in_pre_regex = Regexp.new('^ ')
47
44
  $in_definition_regex = Regexp.new('^[\;\:]')
48
-
49
45
  $blank_line_regex = Regexp.new('^\s*$')
50
-
51
46
  $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
52
-
53
47
  $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
54
48
  $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
55
-
56
49
  $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
57
50
  $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
58
51
  $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
75
68
  $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
76
69
  $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
77
70
 
71
+ $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
78
72
  $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
79
73
  $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
80
74
 
@@ -111,27 +105,26 @@ module Wp2txt
111
105
 
112
106
  def format_wiki!(text, has_retried = false)
113
107
  escape_nowiki!(text)
114
-
115
108
  process_interwiki_links!(text)
116
109
  process_external_links!(text)
117
-
118
110
  unescape_nowiki!(text)
119
- end
120
-
121
- def format_article!(text)
111
+ #####
122
112
  remove_directive!(text)
123
113
  remove_emphasis!(text)
124
114
  mndash!(text)
125
- make_reference!(text)
126
- format_ref!(text)
127
115
  remove_hr!(text)
128
116
  remove_tag!(text)
129
- convert_characters!(text)
130
- correct_inline_template!(text) unless $leave_template
131
- remove_templates!(text) unless $leave_template
132
- remove_table!(text) unless $leave_table
117
+ correct_inline_template!(text) unless $leave_inline_template
118
+ remove_templates!(text) unless $leave_inline_template
119
+ # remove_table!(text) unless $leave_table
133
120
  end
134
121
 
122
+ def cleanup!(text)
123
+ text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/\n\n\n+/m){"\n\n"}
125
+ text.strip!
126
+ text << "\n\n"
127
+ end
135
128
  #################### parser for nested structure ####################
136
129
 
137
130
  def process_nested_structure(scanner, left, right, &block)
@@ -237,6 +230,10 @@ module Wp2txt
237
230
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
238
231
  ""
239
232
  end
233
+ scanner = StringScanner.new(result)
234
+ result = process_nested_structure(scanner, "{", "}") do |contents|
235
+ ""
236
+ end
240
237
  str.replace(result)
241
238
  end
242
239
 
@@ -295,48 +292,48 @@ module Wp2txt
295
292
  str.gsub!($mndash_regex, "–")
296
293
  end
297
294
 
298
- def remove_hr!(page)
299
- page.gsub!($remove_hr_regex, "")
295
+ def remove_hr!(str)
296
+ str.gsub!($remove_hr_regex, "")
300
297
  end
301
298
 
299
+ def remove_ref!(str)
300
+ str.gsub!($format_ref_regex){""}
301
+ end
302
+
302
303
  def make_reference!(str)
303
304
  str.gsub!($make_reference_regex_a){"\n"}
304
305
  str.gsub!($make_reference_regex_b){""}
305
306
  str.gsub!($make_reference_regex_c){"[ref]"}
306
307
  str.gsub!($make_reference_regex_d){"[/ref]"}
307
- str.gsub!($format_ref_regex){""} unless $leave_ref
308
- end
309
-
310
- def format_ref!(page)
311
- ###### do nothing for now
312
- # page.gsub!($format_ref_regex) do
313
- # end
314
308
  end
315
309
 
316
310
  def correct_inline_template!(str)
317
- str.gsub!($remove_inline_regex) do
318
- key = $1
319
- if $onset_bar_regex =~ key
320
- result = key
321
- elsif
322
- info = key.split("|")
323
- type_code = info.first
324
- case type_code
325
- when $type_code_regex
326
- out = info[-1]
327
- else
328
- if $leave_template
329
- out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
330
- else
331
- out = ""
311
+ scanner = StringScanner.new(str)
312
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
+ parts = contents.split("|")
314
+ # type_code = parts.first
315
+ # case type_code
316
+ # when $type_code_regex
317
+ # out = parts[-1]
318
+ # else
319
+ # case parts.size
320
+ # when 0
321
+ # out = ""
322
+ # when 1
323
+ # out = parts.first || ""
324
+ # else
325
+ # while parts.size > 2 && parts.last.split("=").size > 1
326
+ while parts.size > 1 && parts.last.split("=").size > 1
327
+ parts.pop
332
328
  end
333
- end
334
- out
335
- else
336
- ""
337
- end
329
+ out = parts.last || ""
330
+ # end
331
+ # end
332
+ out.strip
338
333
  end
334
+ str.replace result
339
335
  end
336
+
340
337
 
341
338
  #################### file related utilities ####################
342
339
 
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.8"
2
+ VERSION = "0.9.2"
3
3
  end
data/lib/wp2txt.rb CHANGED
@@ -3,13 +3,10 @@
3
3
 
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
- # require "rubygems"
7
- # require "bundler/setup"
8
-
9
6
  require "nokogiri"
10
- # require "oga"
11
- # require "ox"
7
+ require "parallel"
12
8
 
9
+ require 'etc'
13
10
  require 'pp'
14
11
  require "wp2txt/article"
15
12
  require "wp2txt/utils"
@@ -29,7 +26,7 @@ module Wp2txt
29
26
 
30
27
  include Wp2txt
31
28
 
32
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
29
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
33
30
  @parent = parent
34
31
  @fp = nil
35
32
 
@@ -38,6 +35,8 @@ module Wp2txt
38
35
  @tfile_size = tfile_size
39
36
  @convert = convert
40
37
  @strip_tmarker = strip_tmarker
38
+ num_cores_available = Etc.nprocessors
39
+ @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
41
40
  end
42
41
 
43
42
  def file_size(file)
@@ -102,6 +101,7 @@ module Wp2txt
102
101
  if /.bz2$/ =~ @input_file
103
102
  unless NO_BZ2
104
103
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
105
105
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
106
  @infile_size = file_size(file)
107
107
  @parent.msg("... Done.", 1)
@@ -113,6 +113,7 @@ module Wp2txt
113
113
  else
114
114
  file = IO.popen("bzip2 -c -d #{@input_file}")
115
115
  end
116
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
116
117
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
117
118
  @infile_size = file_size(file)
118
119
  @parent.msg("... Done.", 1)
@@ -237,81 +238,73 @@ module Wp2txt
237
238
  end_flag = false
238
239
  terminal_round = false
239
240
  output_text = ""
241
+ pages = []
242
+ data_empty = false
240
243
 
241
- while page = get_page
242
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
243
- xml = xmlns + page + "</mediawiki>"
244
-
245
- input = Nokogiri::XML(xml, nil, 'UTF-8')
246
- page = input.xpath("//xmlns:text").first
247
- pp_title = page.parent.parent.at_css "title"
248
- title = pp_title.content
249
- next if /\:/ =~ title
250
- text = page.content
251
-
252
- # input = Oga.parse_xml(xml)
253
- # page = input.xpath("//xmlns:text").first
254
- # title = page.parent.parent.xpath("//xmlns:title").first.text
255
- # next if /\:/ =~ title
256
- # text = page.text
244
+ begin
245
+ page = get_page
246
+ if page
247
+ pages << page
248
+ else
249
+ data_empty = true
250
+ end
251
+ if data_empty || pages.size == @num_threads
252
+ # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
+ pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
254
+ page_text = {:order => n, :data => nil}
255
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
256
+ xml = xmlns + page + "</mediawiki>"
257
257
 
258
- # input = Ox.load(xml, :encoding => "UTF-8")
259
- # title = ""
260
- # text = ""
261
- # input.nodes.first.nodes.each do |n|
262
- # if n.name == "title"
263
- # title = n.nodes.first
264
- # if /\:/ =~ title
265
- # title = ""
266
- # break
267
- # end
268
- # elsif n.name == "revision"
269
- # n.nodes.each do |o|
270
- # if o.name == "text"
271
- # text = o.nodes.first
272
- # break
273
- # end
274
- # end
275
- # end
276
- # end
277
- # next if title == "" || text == ""
258
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
259
+ page = input.xpath("//xmlns:text").first
260
+ pp_title = page.parent.parent.at_css "title"
261
+ title = pp_title.content
262
+ unless /\:/ =~ title
263
+ text = page.content
264
+ text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
265
+ num_of_newlines = content.count("\n")
266
+ if num_of_newlines == 0
267
+ ""
268
+ else
269
+ "\n" * num_of_newlines
270
+ end
271
+ end
272
+ article = Article.new(text, title, @strip_tmarker)
273
+ page_text[:data] = block.call(article)
274
+ end
275
+ page_text
276
+ end
277
+ pages.clear
278
+ pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
279
+ pages_text.each do |page_text|
280
+ output_text << page_text
281
+ @count ||= 0; @count += 1;
282
+ @total_size = output_text.bytesize
283
+ # set end_flag once the accumulated output exceeds the per-file size limit
284
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
285
+ end
278
286
 
279
- # remove all comment texts
280
- # and insert as many number of new line chars included in
281
- # each comment instead
282
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
283
- num_of_newlines = content.count("\n")
284
- if num_of_newlines == 0
285
- ""
286
- else
287
- "\n" * num_of_newlines
287
+ # close the present file, then open a new one
288
+ if end_flag
289
+ cleanup!(output_text)
290
+ @fp.puts(output_text)
291
+ output_text = ""
292
+ @total_size = 0
293
+ end_flag = false
294
+ @fp.close
295
+ @file_index += 1
296
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
297
+ @outfiles << outfilename
298
+ @fp = File.open(outfilename, "w")
299
+ next
288
300
  end
289
301
  end
290
-
291
- @count ||= 0;@count += 1;
292
-
293
- article = Article.new(text, title, @strip_tmarker)
294
- output_text += block.call(article)
295
- @total_size = output_text.bytesize
302
+ end while !data_empty
296
303
 
297
- # flagged when data exceeds the size of output file
298
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
299
-
300
- #close the present file, then open a new one
301
- if end_flag
302
- @fp.puts(output_text)
303
- output_text = ""
304
- @total_size = 0
305
- end_flag = false
306
- @fp.close
307
- @file_index += 1
308
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
309
- @outfiles << outfilename
310
- @fp = File.open(outfilename, "w")
311
- next
312
- end
304
+ if output_text != ""
305
+ cleanup!(output_text)
306
+ @fp.puts(output_text)
313
307
  end
314
- @fp.puts(output_text) if output_text != ""
315
308
  notify_parent(true)
316
309
  @parent.after
317
310
  @fp.close
@@ -351,4 +344,5 @@ module Wp2txt
351
344
  @parent.msg("Processing finished", 1)
352
345
  end
353
346
  end
354
- end
347
+ end
348
+
data/spec/utils_spec.rb CHANGED
@@ -182,22 +182,34 @@ describe "Wp2txt" do
182
182
  end
183
183
  end
184
184
 
185
- # describe "process_template" do
186
- # it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # expect(process_template(str_before)).to eq str_after
190
- # str_before = "{{lang|en|Japan}}"
191
- # str_after = "Japan"
192
- # expect(process_template(str_before)).to eq str_after
193
- # str_before = "{{a|b=c|d=f}}"
194
- # str_after = "a"
195
- # expect(process_template(str_before)).to eq str_after
196
- # str_before = "{{a|b|{{c|d|e}}}}"
197
- # str_after = "e"
198
- # expect(process_template(str_before)).to eq str_after
199
- # end
200
- # end
185
+ describe "correct_inline_template!" do
186
+ it "removes brackets, leaving some text" do
187
+ # str_before = "{{}}"
188
+ # str_after = ""
189
+ # correct_inline_template!(str_before)
190
+ # expect(str_before).to eq str_after
191
+ str_before = "{{MedalCountry | {{JPN}} }}"
192
+ str_after = "JPN"
193
+ correct_inline_template!(str_before)
194
+ expect(str_before).to eq str_after
195
+ str_before = "{{lang|en|Japan}}"
196
+ str_after = "Japan"
197
+ correct_inline_template!(str_before)
198
+ expect(str_before).to eq str_after
199
+ str_before = "{{a|b=c|d=f}}"
200
+ str_after = "a"
201
+ correct_inline_template!(str_before)
202
+ expect(str_before).to eq str_after
203
+ str_before = "{{a|b|{{c|d|e}}}}"
204
+ str_after = "e"
205
+ correct_inline_template!(str_before)
206
+ expect(str_before).to eq str_after
207
+ str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
208
+ str_after = "日本人に多く見受けられる"
209
+ correct_inline_template!(str_before)
210
+ expect(str_before).to eq str_after
211
+ end
212
+ end
201
213
 
202
214
  # describe "expand_template" do
203
215
  # it "gets data corresponding to a given template using mediawiki api" do
data/wp2txt.gemspec CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
23
23
  # s.add_development_dependency "rake"
24
24
 
25
25
  s.add_dependency "nokogiri"
26
+ s.add_dependency "parallel"
26
27
  s.add_dependency "htmlentities"
27
- s.add_dependency "trollop"
28
+ s.add_dependency "optimist"
28
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.8
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-01 00:00:00.000000000 Z
11
+ date: 2022-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: parallel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: htmlentities
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +53,7 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: trollop
56
+ name: optimist
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
@@ -69,8 +83,10 @@ files:
69
83
  - Rakefile
70
84
  - bin/benchmark.rb
71
85
  - bin/wp2txt
72
- - data/testdata.bz2
73
- - error_log.txt
86
+ - data/output_samples/testdata_en.txt
87
+ - data/output_samples/testdata_ja.txt
88
+ - data/testdata_en.bz2
89
+ - data/testdata_ja.bz2
74
90
  - lib/wp2txt.rb
75
91
  - lib/wp2txt/article.rb
76
92
  - lib/wp2txt/mw_api.rb
@@ -83,7 +99,7 @@ files:
83
99
  homepage: http://github.com/yohasebe/wp2txt
84
100
  licenses: []
85
101
  metadata: {}
86
- post_install_message:
102
+ post_install_message:
87
103
  rdoc_options: []
88
104
  require_paths:
89
105
  - lib
@@ -98,9 +114,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
114
  - !ruby/object:Gem::Version
99
115
  version: '0'
100
116
  requirements: []
101
- rubyforge_project: wp2txt
102
- rubygems_version: 2.4.2
103
- signing_key:
117
+ rubygems_version: 3.3.3
118
+ signing_key:
104
119
  specification_version: 4
105
120
  summary: Wikipedia dump to text converter
106
121
  test_files:
data/error_log.txt DELETED
@@ -1 +0,0 @@
1
- [[アンパサンド]]