wp2txt 0.9.5.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt/utils.rb CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
77
77
  $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
78
  $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
79
  $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
+
81
+ $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
+ $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
+ $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
+ $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
+ $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
+
87
+ $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
+ $cleanup_regex_02 = Regexp.new('^File:.+$')
89
+ $cleanup_regex_03 = Regexp.new('^\|.*$')
90
+ $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
+ $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
+ $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
+ $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
+ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
+
80
96
  ###################################################
81
97
 
82
98
  module Wp2txt
@@ -104,11 +120,12 @@ module Wp2txt
104
120
  end
105
121
 
106
122
  def format_wiki!(text, has_retried = false)
123
+ remove_complex!(text)
124
+
107
125
  escape_nowiki!(text)
108
126
  process_interwiki_links!(text)
109
127
  process_external_links!(text)
110
128
  unescape_nowiki!(text)
111
-
112
129
  remove_directive!(text)
113
130
  remove_emphasis!(text)
114
131
  mndash!(text)
@@ -120,15 +137,18 @@ module Wp2txt
120
137
  end
121
138
 
122
139
  def cleanup!(text)
123
- text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
- text.gsub!(/^File:.+$/){""}
125
- text.gsub!(/^\|.*$/){""}
126
- text.gsub!(/^{{.*$/){""}
127
- text.gsub!(/^}}.*$/){""}
128
- text.gsub!(/\n\n\n+/m){"\n\n"}
140
+ text.gsub!($cleanup_regex_01){""}
141
+ text.gsub!($cleanup_regex_02){""}
142
+ text.gsub!($cleanup_regex_03){""}
143
+ text.gsub!($cleanup_regex_04){""}
144
+ text.gsub!($cleanup_regex_05){""}
145
+ text.gsub!($cleanup_regex_06){""}
146
+ text.gsub!($cleanup_regex_07){""}
147
+ text.gsub!($cleanup_regex_08){"\n\n"}
129
148
  text.strip!
130
149
  text << "\n\n"
131
150
  end
151
+
132
152
  #################### parser for nested structure ####################
133
153
 
134
154
  def process_nested_structure(scanner, left, right, &block)
@@ -217,12 +237,16 @@ module Wp2txt
217
237
  def process_external_links!(str)
218
238
  scanner = StringScanner.new(str)
219
239
  result = process_nested_structure(scanner, "[", "]") do |contents|
220
- parts = contents.split(" ", 2)
221
- case parts.size
222
- when 1
223
- parts.first || ""
240
+ if /\A\s.+\s\z/ =~ contents
241
+ " (#{contents.strip}) "
224
242
  else
225
- parts.last || ""
243
+ parts = contents.split(" ", 2)
244
+ case parts.size
245
+ when 1
246
+ parts.first || ""
247
+ else
248
+ parts.last || ""
249
+ end
226
250
  end
227
251
  end
228
252
  str.replace(result)
@@ -239,10 +263,6 @@ module Wp2txt
239
263
  result = process_nested_structure(scanner, "{", "}") do |contents|
240
264
  ""
241
265
  end
242
- scanner = StringScanner.new(result)
243
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
- ""
245
- end
246
266
  str.replace(result)
247
267
  end
248
268
 
@@ -310,7 +330,8 @@ module Wp2txt
310
330
  end
311
331
 
312
332
  def remove_html!(str)
313
- ["div", "gallery", "timeline"].each do |tag|
333
+ str.gsub!(/<[^<>]+\/>/){""}
334
+ ["div", "gallery", "timeline", "noinclude"].each do |tag|
314
335
  scanner = StringScanner.new(str)
315
336
  result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
337
  ""
@@ -320,11 +341,11 @@ module Wp2txt
320
341
  end
321
342
 
322
343
  def remove_complex!(str)
323
- str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
- str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
- str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
- str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
- str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
344
+ str.gsub!($complex_regex_01){"《#{$1}》"}
345
+ str.gsub!($complex_regex_02){""}
346
+ str.gsub!($complex_regex_03){""}
347
+ str.gsub!($complex_regex_04){""}
348
+ str.gsub!($complex_regex_05){""}
328
349
  end
329
350
 
330
351
  def make_reference!(str)
@@ -340,6 +361,8 @@ module Wp2txt
340
361
  parts = contents.split("|")
341
362
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
363
  parts.shift
364
+ elsif /\Alang\-/i =~ parts[0]
365
+ parts.shift
343
366
  elsif /\Alang=/i =~ parts[1]
344
367
  parts.shift
345
368
  end
@@ -422,7 +445,7 @@ module Wp2txt
422
445
  end
423
446
  end
424
447
 
425
- def rename(files)
448
+ def rename(files, ext = "txt")
426
449
  # num of digits necessary to name the last file generated
427
450
  maxwidth = 0
428
451
 
@@ -435,8 +458,9 @@ module Wp2txt
435
458
  newname= f.sub(/\-(\d+)\z/) do
436
459
  "-" + sprintf("%0#{maxwidth}d", $1.to_i)
437
460
  end
438
- File.rename(f, newname + ".txt")
461
+ File.rename(f, newname + ".#{ext}")
439
462
  end
463
+ return true
440
464
  end
441
465
 
442
466
  # convert int of seconds to string in the format 00:00:00
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.5.1"
2
+ VERSION = "1.0.0"
3
3
  end
data/lib/wp2txt.rb CHANGED
@@ -4,14 +4,8 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
6
  require "nokogiri"
7
- require "parallel"
8
-
9
- require 'etc'
10
- require 'pp'
11
7
  require "wp2txt/article"
12
8
  require "wp2txt/utils"
13
- require "wp2txt/progressbar"
14
- # require "wp2txt/mw_api"
15
9
 
16
10
  begin
17
11
  require "bzip2-ruby"
@@ -22,21 +16,14 @@ rescue LoadError
22
16
  end
23
17
 
24
18
  module Wp2txt
25
- class Runner
26
-
19
+ class Splitter
27
20
  include Wp2txt
28
-
29
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
30
- @parent = parent
21
+ def initialize(input_file, output_dir = ".", tfile_size = 10)
31
22
  @fp = nil
32
-
33
23
  @input_file = input_file
34
24
  @output_dir = output_dir
35
25
  @tfile_size = tfile_size
36
- @convert = convert
37
- @strip_tmarker = strip_tmarker
38
- num_cores_available = Etc.nprocessors
39
- @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
26
+ prepare
40
27
  end
41
28
 
42
29
  def file_size(file)
@@ -62,34 +49,9 @@ module Wp2txt
62
49
  time_elapsed = Time.now - origin
63
50
  size
64
51
  end
65
-
66
- # control the display of command line progressbar (or gui which is not available for now)
67
- def notify_parent(last = false)
68
- @last_time ||= Time.now.to_f
69
- @elapsed_sum ||= 0
70
- time_now = Time.now.to_f
71
- elapsed_from_last = (time_now - @last_time).to_i
72
-
73
- if elapsed_from_last > 0.3 || last
74
52
 
75
- @last_time = time_now
76
- @elapsed_sum += elapsed_from_last
77
- gvalue = (@size_read.to_f / @infile_size.to_f * 100 * 100).to_i
78
- elt_str = sec_to_str(@elapsed_sum)
79
- if last
80
- eta_str = "00:00:00"
81
- else
82
- lines_persec = @size_read / @elapsed_sum if @elapsed_sum > 0
83
- eta_sec = (@infile_size - @size_read) / lines_persec
84
- eta_str = sec_to_str(eta_sec)
85
- end
86
- @parent.prg_update(gvalue, elt_str, eta_str)
87
- end
88
- end
89
-
90
- # check the size of input file (bz2 or plain xml) when uncompressed
53
+ # check the size of input file (bz2 or plain xml) when decompressed
91
54
  def prepare
92
-
93
55
  # if output_dir is not specified, output in the same directory
94
56
  # as the imput file
95
57
  if !@output_dir && @input_file
@@ -101,28 +63,12 @@ module Wp2txt
101
63
  if /.bz2$/ =~ @input_file
102
64
  unless NO_BZ2
103
65
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
105
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
- @infile_size = file_size(file)
107
- @parent.msg("... Done.", 1)
108
- file.close
109
- file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
110
66
  else
111
67
  if RUBY_PLATFORM.index("win32")
112
68
  file = IO.popen("bunzip2.exe -c #{@input_file}")
113
69
  else
114
70
  file = IO.popen("bzip2 -c -d #{@input_file}")
115
71
  end
116
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
117
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
118
- @infile_size = file_size(file)
119
- @parent.msg("... Done.", 1)
120
- file.close # try to reopen since rewind method is unavailable
121
- if RUBY_PLATFORM.index("win32")
122
- file = IO.popen("bunzip2.exe -c #{@input_file}")
123
- else
124
- file = IO.popen("bzip2 -c -d #{@input_file}")
125
- end
126
72
  end
127
73
  else # meaning that it is a text file
128
74
  @infile_size = File.stat(@input_file).size
@@ -137,8 +83,6 @@ module Wp2txt
137
83
  @outfiles = []
138
84
  @outfiles << outfilename
139
85
  @fp = File.open(outfilename, "w")
140
- @parent.before
141
- @parent.data_set(@input_file, 100 * 100)
142
86
  @file_pointer = file
143
87
  return true
144
88
  end
@@ -190,13 +134,113 @@ module Wp2txt
190
134
  end
191
135
  end
192
136
 
137
+ def split_file
138
+ output_text = ""
139
+ end_flag = false
140
+ while text = get_newline
141
+ @count ||= 0;@count += 1;
142
+ @size_read ||=0
143
+ @size_read += text.bytesize
144
+ @total_size += text.bytesize
145
+ output_text << text
146
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
147
+ # never close the file until the end of the page even if end_flag is on
148
+ if end_flag && /<\/page/ =~ text
149
+ @fp.puts(output_text)
150
+ output_text = ""
151
+ @total_size = 0
152
+ end_flag = false
153
+ @fp.close
154
+ @file_index += 1
155
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
156
+ @outfiles << outfilename
157
+ @fp = File.open(outfilename, "w")
158
+ next
159
+ end
160
+ end
161
+ @fp.puts(output_text) if output_text != ""
162
+ @fp.close
163
+
164
+ if File.size(outfilename) == 0
165
+ File.delete(outfilename)
166
+ @outfiles.delete(outfilename)
167
+ end
168
+
169
+ rename(@outfiles, "xml")
170
+ end
171
+ end
172
+
173
+ class Runner
174
+ include Wp2txt
175
+
176
+ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
177
+ @fp = nil
178
+ @input_file = input_file
179
+ @output_dir = output_dir
180
+ @strip_tmarker = strip_tmarker
181
+ @del_interfile = del_interfile
182
+ prepare
183
+ end
184
+
185
+ def prepare
186
+ @infile_size = File.stat(@input_file).size
187
+ file = open(@input_file)
188
+ @file_pointer = file
189
+ @outfile_base = File.basename(@input_file, ".*")
190
+ @total_size = 0
191
+ return true
192
+ end
193
+
194
+ def fill_buffer
195
+ while true do
196
+ begin
197
+ new_lines = @file_pointer.read(10485760)
198
+ rescue => e
199
+ return nil
200
+ end
201
+ return nil unless new_lines
202
+
203
+ # temp_buf is filled with text split by "\n"
204
+ temp_buf = []
205
+ ss = StringScanner.new(new_lines)
206
+ while ss.scan(/.*?\n/m)
207
+ temp_buf << ss[0]
208
+ end
209
+ temp_buf << ss.rest unless ss.eos?
210
+
211
+ new_first_line = temp_buf.shift
212
+ if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
213
+ @buffer.last << new_first_line
214
+ @buffer << ""
215
+ else
216
+ @buffer.last << new_first_line
217
+ end
218
+ @buffer += temp_buf unless temp_buf.empty?
219
+ if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
220
+ @buffer << ""
221
+ end
222
+ break if @buffer.size > 1
223
+ end
224
+ return true
225
+ end
226
+
227
+ def get_newline
228
+ @buffer ||= [""]
229
+ if @buffer.size == 1
230
+ return nil unless fill_buffer
231
+ end
232
+ if @buffer.empty?
233
+ return nil
234
+ else
235
+ new_line = @buffer.shift
236
+ return new_line
237
+ end
238
+ end
239
+
193
240
  def get_page
194
241
  inside_page = false
195
242
  page = ""
196
243
  while line = get_newline
197
- notify_parent
198
- @size_read ||=0; @size_read += line.bytesize
199
-
200
244
  if /<page>/ =~ line #
201
245
  page << line
202
246
  inside_page = true
@@ -215,22 +259,7 @@ module Wp2txt
215
259
  end
216
260
  end
217
261
 
218
- # call this method to do the job
219
262
  def extract_text(&block)
220
- prepare
221
- if @convert
222
- if block
223
- extract_and_convert(&block)
224
- else
225
- extract_and_convert
226
- end
227
- else
228
- # output the original xml only split to files of the specified size
229
- extract
230
- end
231
- end
232
-
233
- def extract_and_convert(&block)
234
263
  in_text = false
235
264
  in_message = false
236
265
  result_text = ""
@@ -241,17 +270,15 @@ module Wp2txt
241
270
  pages = []
242
271
  data_empty = false
243
272
 
244
- begin
273
+ while !data_empty
245
274
  page = get_page
246
275
  if page
247
276
  pages << page
248
277
  else
249
278
  data_empty = true
250
279
  end
251
- if data_empty || pages.size == @num_threads
252
- # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
- pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
254
- page_text = {:order => n, :data => nil}
280
+ if data_empty
281
+ pages.each do |page|
255
282
  xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
256
283
  xml = xmlns + page + "</mediawiki>"
257
284
 
@@ -270,79 +297,22 @@ module Wp2txt
270
297
  end
271
298
  end
272
299
  article = Article.new(text, title, @strip_tmarker)
273
- page_text[:data] = block.call(article)
300
+ page_text = block.call(article)
301
+ output_text << page_text
274
302
  end
275
- page_text
276
- end
277
- pages.clear
278
- pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
279
- pages_text.each do |page_text|
280
- output_text << page_text
281
- @count ||= 0; @count += 1;
282
- @total_size = output_text.bytesize
283
- # flagged when data exceeds the size of output file
284
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
285
303
  end
286
304
 
287
- #close the present file, then open a new one
288
- if end_flag
289
- cleanup!(output_text)
305
+ cleanup!(output_text)
306
+ if output_text.size > 0
307
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
308
+ @fp = File.open(outfilename, "w")
290
309
  @fp.puts(output_text)
291
- output_text = ""
292
- @total_size = 0
293
- end_flag = false
294
310
  @fp.close
295
- @file_index += 1
296
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
297
- @outfiles << outfilename
298
- @fp = File.open(outfilename, "w")
299
- next
300
311
  end
301
- end
302
- end while !data_empty
303
-
304
- if output_text != ""
305
- cleanup!(output_text)
306
- @fp.puts(output_text)
307
- end
308
- notify_parent(true)
309
- @parent.after
310
- @fp.close
311
- rename(@outfiles)
312
- @parent.msg("Processing finished", 1)
313
- end
314
-
315
- def extract
316
- output_text = ""
317
- end_flag = false
318
- while text = get_newline
319
- @count ||= 0;@count += 1;
320
- @size_read ||=0;@size_read += text.bytesize
321
- @total_size += text.bytesize
322
- output_text << text
323
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
324
- notify_parent
325
- # never close the file until the end of the page even if end_flag is on
326
- if end_flag && /<\/page/ =~ text
327
- @fp.puts(output_text)
312
+ File.delete(@input_file) if @del_interfile
328
313
  output_text = ""
329
- @total_size = 0
330
- end_flag = false
331
- @fp.close
332
- @file_index += 1
333
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
334
- @outfiles << outfilename
335
- @fp = File.open(outfilename, "w")
336
- next
337
314
  end
338
315
  end
339
- @fp.puts(output_text) if output_text != ""
340
- notify_parent(true)
341
- @parent.after
342
- @fp.close
343
- rename(@outfiles)
344
- @parent.msg("Processing finished", 1)
345
- end
316
+ end
346
317
  end
347
318
  end
348
-
data/spec/utils_spec.rb CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
184
184
 
185
185
  describe "correct_inline_template!" do
186
186
  it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # correct_inline_template!(str_before)
190
- # expect(str_before).to eq str_after
191
187
  str_before = "{{MedalCountry | {{JPN}} }}"
192
188
  str_after = "JPN"
193
189
  correct_inline_template!(str_before)
@@ -197,11 +193,11 @@ describe "Wp2txt" do
197
193
  correct_inline_template!(str_before)
198
194
  expect(str_before).to eq str_after
199
195
  str_before = "{{a|b=c|d=f}}"
200
- str_after = "a"
196
+ str_after = "c"
201
197
  correct_inline_template!(str_before)
202
198
  expect(str_before).to eq str_after
203
199
  str_before = "{{a|b|{{c|d|e}}}}"
204
- str_after = "e"
200
+ str_after = "b"
205
201
  correct_inline_template!(str_before)
206
202
  expect(str_before).to eq str_after
207
203
  str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
@@ -210,18 +206,4 @@ describe "Wp2txt" do
210
206
  expect(str_before).to eq str_after
211
207
  end
212
208
  end
213
-
214
- # describe "expand_template" do
215
- # it "gets data corresponding to a given template using mediawiki api" do
216
- # uri = "http://en.wiktionary.org/w/api.php"
217
- # template = "{{en-verb}}"
218
- # word = "kick"
219
- # expanded = expand_template(uri, template, word)
220
- # html =<<EOD
221
- # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
222
- # EOD
223
- # html.strip!
224
- # expanded.should == html
225
- # end
226
- # end
227
- end
209
+ end
data/wp2txt.gemspec CHANGED
@@ -14,6 +14,7 @@ Gem::Specification.new do |s|
14
14
  s.rubyforge_project = "wp2txt"
15
15
 
16
16
  s.files = `git ls-files`.split("\n")
17
+ s.files -= ["data/*", "image/*"]
17
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
20
  s.require_paths = ["lib"]
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
23
24
  # s.add_development_dependency "rake"
24
25
 
25
26
  s.add_dependency "nokogiri"
27
+ s.add_dependency "ruby-progressbar"
26
28
  s.add_dependency "parallel"
27
29
  s.add_dependency "htmlentities"
28
30
  s.add_dependency "optimist"
31
+ s.add_dependency "pastel"
32
+ s.add_dependency "tty-spinner"
29
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.5.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-02 00:00:00.000000000 Z
11
+ date: 2022-08-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ruby-progressbar
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: parallel
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,12 +80,39 @@ dependencies:
66
80
  - - ">="
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pastel
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: tty-spinner
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
69
111
  description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
70
112
  XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
71
113
  email:
72
114
  - yohasebe@gmail.com
73
115
  executables:
74
- - benchmark.rb
75
116
  - wp2txt
76
117
  extensions: []
77
118
  extra_rdoc_files: []
@@ -81,20 +122,20 @@ files:
81
122
  - LICENSE
82
123
  - README.md
83
124
  - Rakefile
84
- - bin/benchmark.rb
85
125
  - bin/wp2txt
86
126
  - data/output_samples/testdata_en.txt
87
- - data/output_samples/testdata_en_categories.txt
127
+ - data/output_samples/testdata_en_category.txt
88
128
  - data/output_samples/testdata_en_summary.txt
89
129
  - data/output_samples/testdata_ja.txt
90
- - data/output_samples/testdata_ja_categories.txt
130
+ - data/output_samples/testdata_ja_category.txt
91
131
  - data/output_samples/testdata_ja_summary.txt
92
132
  - data/testdata_en.bz2
93
133
  - data/testdata_ja.bz2
134
+ - image/screenshot.png
135
+ - image/wp2txt-logo.svg
136
+ - image/wp2txt.svg
94
137
  - lib/wp2txt.rb
95
138
  - lib/wp2txt/article.rb
96
- - lib/wp2txt/mw_api.rb
97
- - lib/wp2txt/progressbar.rb
98
139
  - lib/wp2txt/utils.rb
99
140
  - lib/wp2txt/version.rb
100
141
  - spec/spec_helper.rb
@@ -118,7 +159,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
159
  - !ruby/object:Gem::Version
119
160
  version: '0'
120
161
  requirements: []
121
- rubygems_version: 3.3.7
162
+ rubygems_version: 3.3.3
122
163
  signing_key:
123
164
  specification_version: 4
124
165
  summary: Wikipedia dump to text converter