wp2txt 0.9.5.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt/utils.rb CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
77
77
  $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
78
  $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
79
  $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
+
81
+ $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
+ $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
+ $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
+ $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
+ $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
+
87
+ $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
+ $cleanup_regex_02 = Regexp.new('^File:.+$')
89
+ $cleanup_regex_03 = Regexp.new('^\|.*$')
90
+ $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
+ $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
+ $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
+ $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
+ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
+
80
96
  ###################################################
81
97
 
82
98
  module Wp2txt
@@ -104,11 +120,12 @@ module Wp2txt
104
120
  end
105
121
 
106
122
  def format_wiki!(text, has_retried = false)
123
+ remove_complex!(text)
124
+
107
125
  escape_nowiki!(text)
108
126
  process_interwiki_links!(text)
109
127
  process_external_links!(text)
110
128
  unescape_nowiki!(text)
111
-
112
129
  remove_directive!(text)
113
130
  remove_emphasis!(text)
114
131
  mndash!(text)
@@ -120,15 +137,18 @@ module Wp2txt
120
137
  end
121
138
 
122
139
  def cleanup!(text)
123
- text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
- text.gsub!(/^File:.+$/){""}
125
- text.gsub!(/^\|.*$/){""}
126
- text.gsub!(/^{{.*$/){""}
127
- text.gsub!(/^}}.*$/){""}
128
- text.gsub!(/\n\n\n+/m){"\n\n"}
140
+ text.gsub!($cleanup_regex_01){""}
141
+ text.gsub!($cleanup_regex_02){""}
142
+ text.gsub!($cleanup_regex_03){""}
143
+ text.gsub!($cleanup_regex_04){""}
144
+ text.gsub!($cleanup_regex_05){""}
145
+ text.gsub!($cleanup_regex_06){""}
146
+ text.gsub!($cleanup_regex_07){""}
147
+ text.gsub!($cleanup_regex_08){"\n\n"}
129
148
  text.strip!
130
149
  text << "\n\n"
131
150
  end
151
+
132
152
  #################### parser for nested structure ####################
133
153
 
134
154
  def process_nested_structure(scanner, left, right, &block)
@@ -217,12 +237,16 @@ module Wp2txt
217
237
  def process_external_links!(str)
218
238
  scanner = StringScanner.new(str)
219
239
  result = process_nested_structure(scanner, "[", "]") do |contents|
220
- parts = contents.split(" ", 2)
221
- case parts.size
222
- when 1
223
- parts.first || ""
240
+ if /\A\s.+\s\z/ =~ contents
241
+ " (#{contents.strip}) "
224
242
  else
225
- parts.last || ""
243
+ parts = contents.split(" ", 2)
244
+ case parts.size
245
+ when 1
246
+ parts.first || ""
247
+ else
248
+ parts.last || ""
249
+ end
226
250
  end
227
251
  end
228
252
  str.replace(result)
@@ -239,10 +263,6 @@ module Wp2txt
239
263
  result = process_nested_structure(scanner, "{", "}") do |contents|
240
264
  ""
241
265
  end
242
- scanner = StringScanner.new(result)
243
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
- ""
245
- end
246
266
  str.replace(result)
247
267
  end
248
268
 
@@ -310,7 +330,8 @@ module Wp2txt
310
330
  end
311
331
 
312
332
  def remove_html!(str)
313
- ["div", "gallery", "timeline"].each do |tag|
333
+ str.gsub!(/<[^<>]+\/>/){""}
334
+ ["div", "gallery", "timeline", "noinclude"].each do |tag|
314
335
  scanner = StringScanner.new(str)
315
336
  result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
337
  ""
@@ -320,11 +341,11 @@ module Wp2txt
320
341
  end
321
342
 
322
343
  def remove_complex!(str)
323
- str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
- str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
- str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
- str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
- str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
344
+ str.gsub!($complex_regex_01){"《#{$1}》"}
345
+ str.gsub!($complex_regex_02){""}
346
+ str.gsub!($complex_regex_03){""}
347
+ str.gsub!($complex_regex_04){""}
348
+ str.gsub!($complex_regex_05){""}
328
349
  end
329
350
 
330
351
  def make_reference!(str)
@@ -340,6 +361,8 @@ module Wp2txt
340
361
  parts = contents.split("|")
341
362
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
363
  parts.shift
364
+ elsif /\Alang\-/i =~ parts[0]
365
+ parts.shift
343
366
  elsif /\Alang=/i =~ parts[1]
344
367
  parts.shift
345
368
  end
@@ -422,7 +445,7 @@ module Wp2txt
422
445
  end
423
446
  end
424
447
 
425
- def rename(files)
448
+ def rename(files, ext = "txt")
426
449
  # num of digits necessary to name the last file generated
427
450
  maxwidth = 0
428
451
 
@@ -435,8 +458,9 @@ module Wp2txt
435
458
  newname= f.sub(/\-(\d+)\z/) do
436
459
  "-" + sprintf("%0#{maxwidth}d", $1.to_i)
437
460
  end
438
- File.rename(f, newname + ".txt")
461
+ File.rename(f, newname + ".#{ext}")
439
462
  end
463
+ return true
440
464
  end
441
465
 
442
466
  # convert int of seconds to string in the format 00:00:00
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.5.1"
2
+ VERSION = "1.0.0"
3
3
  end
data/lib/wp2txt.rb CHANGED
@@ -4,14 +4,8 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
6
  require "nokogiri"
7
- require "parallel"
8
-
9
- require 'etc'
10
- require 'pp'
11
7
  require "wp2txt/article"
12
8
  require "wp2txt/utils"
13
- require "wp2txt/progressbar"
14
- # require "wp2txt/mw_api"
15
9
 
16
10
  begin
17
11
  require "bzip2-ruby"
@@ -22,21 +16,14 @@ rescue LoadError
22
16
  end
23
17
 
24
18
  module Wp2txt
25
- class Runner
26
-
19
+ class Splitter
27
20
  include Wp2txt
28
-
29
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
30
- @parent = parent
21
+ def initialize(input_file, output_dir = ".", tfile_size = 10)
31
22
  @fp = nil
32
-
33
23
  @input_file = input_file
34
24
  @output_dir = output_dir
35
25
  @tfile_size = tfile_size
36
- @convert = convert
37
- @strip_tmarker = strip_tmarker
38
- num_cores_available = Etc.nprocessors
39
- @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
26
+ prepare
40
27
  end
41
28
 
42
29
  def file_size(file)
@@ -62,34 +49,9 @@ module Wp2txt
62
49
  time_elapsed = Time.now - origin
63
50
  size
64
51
  end
65
-
66
- # control the display of command line progressbar (or gui which is not available for now)
67
- def notify_parent(last = false)
68
- @last_time ||= Time.now.to_f
69
- @elapsed_sum ||= 0
70
- time_now = Time.now.to_f
71
- elapsed_from_last = (time_now - @last_time).to_i
72
-
73
- if elapsed_from_last > 0.3 || last
74
52
 
75
- @last_time = time_now
76
- @elapsed_sum += elapsed_from_last
77
- gvalue = (@size_read.to_f / @infile_size.to_f * 100 * 100).to_i
78
- elt_str = sec_to_str(@elapsed_sum)
79
- if last
80
- eta_str = "00:00:00"
81
- else
82
- lines_persec = @size_read / @elapsed_sum if @elapsed_sum > 0
83
- eta_sec = (@infile_size - @size_read) / lines_persec
84
- eta_str = sec_to_str(eta_sec)
85
- end
86
- @parent.prg_update(gvalue, elt_str, eta_str)
87
- end
88
- end
89
-
90
- # check the size of input file (bz2 or plain xml) when uncompressed
53
+ # check the size of input file (bz2 or plain xml) when decompressed
91
54
  def prepare
92
-
93
55
  # if output_dir is not specified, output in the same directory
94
56
  # as the imput file
95
57
  if !@output_dir && @input_file
@@ -101,28 +63,12 @@ module Wp2txt
101
63
  if /.bz2$/ =~ @input_file
102
64
  unless NO_BZ2
103
65
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
105
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
- @infile_size = file_size(file)
107
- @parent.msg("... Done.", 1)
108
- file.close
109
- file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
110
66
  else
111
67
  if RUBY_PLATFORM.index("win32")
112
68
  file = IO.popen("bunzip2.exe -c #{@input_file}")
113
69
  else
114
70
  file = IO.popen("bzip2 -c -d #{@input_file}")
115
71
  end
116
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
117
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
118
- @infile_size = file_size(file)
119
- @parent.msg("... Done.", 1)
120
- file.close # try to reopen since rewind method is unavailable
121
- if RUBY_PLATFORM.index("win32")
122
- file = IO.popen("bunzip2.exe -c #{@input_file}")
123
- else
124
- file = IO.popen("bzip2 -c -d #{@input_file}")
125
- end
126
72
  end
127
73
  else # meaning that it is a text file
128
74
  @infile_size = File.stat(@input_file).size
@@ -137,8 +83,6 @@ module Wp2txt
137
83
  @outfiles = []
138
84
  @outfiles << outfilename
139
85
  @fp = File.open(outfilename, "w")
140
- @parent.before
141
- @parent.data_set(@input_file, 100 * 100)
142
86
  @file_pointer = file
143
87
  return true
144
88
  end
@@ -190,13 +134,113 @@ module Wp2txt
190
134
  end
191
135
  end
192
136
 
137
+ def split_file
138
+ output_text = ""
139
+ end_flag = false
140
+ while text = get_newline
141
+ @count ||= 0;@count += 1;
142
+ @size_read ||=0
143
+ @size_read += text.bytesize
144
+ @total_size += text.bytesize
145
+ output_text << text
146
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
147
+ # never close the file until the end of the page even if end_flag is on
148
+ if end_flag && /<\/page/ =~ text
149
+ @fp.puts(output_text)
150
+ output_text = ""
151
+ @total_size = 0
152
+ end_flag = false
153
+ @fp.close
154
+ @file_index += 1
155
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
156
+ @outfiles << outfilename
157
+ @fp = File.open(outfilename, "w")
158
+ next
159
+ end
160
+ end
161
+ @fp.puts(output_text) if output_text != ""
162
+ @fp.close
163
+
164
+ if File.size(outfilename) == 0
165
+ File.delete(outfilename)
166
+ @outfiles.delete(outfilename)
167
+ end
168
+
169
+ rename(@outfiles, "xml")
170
+ end
171
+ end
172
+
173
+ class Runner
174
+ include Wp2txt
175
+
176
+ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
177
+ @fp = nil
178
+ @input_file = input_file
179
+ @output_dir = output_dir
180
+ @strip_tmarker = strip_tmarker
181
+ @del_interfile = del_interfile
182
+ prepare
183
+ end
184
+
185
+ def prepare
186
+ @infile_size = File.stat(@input_file).size
187
+ file = open(@input_file)
188
+ @file_pointer = file
189
+ @outfile_base = File.basename(@input_file, ".*")
190
+ @total_size = 0
191
+ return true
192
+ end
193
+
194
+ def fill_buffer
195
+ while true do
196
+ begin
197
+ new_lines = @file_pointer.read(10485760)
198
+ rescue => e
199
+ return nil
200
+ end
201
+ return nil unless new_lines
202
+
203
+ # temp_buf is filled with text split by "\n"
204
+ temp_buf = []
205
+ ss = StringScanner.new(new_lines)
206
+ while ss.scan(/.*?\n/m)
207
+ temp_buf << ss[0]
208
+ end
209
+ temp_buf << ss.rest unless ss.eos?
210
+
211
+ new_first_line = temp_buf.shift
212
+ if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
213
+ @buffer.last << new_first_line
214
+ @buffer << ""
215
+ else
216
+ @buffer.last << new_first_line
217
+ end
218
+ @buffer += temp_buf unless temp_buf.empty?
219
+ if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
220
+ @buffer << ""
221
+ end
222
+ break if @buffer.size > 1
223
+ end
224
+ return true
225
+ end
226
+
227
+ def get_newline
228
+ @buffer ||= [""]
229
+ if @buffer.size == 1
230
+ return nil unless fill_buffer
231
+ end
232
+ if @buffer.empty?
233
+ return nil
234
+ else
235
+ new_line = @buffer.shift
236
+ return new_line
237
+ end
238
+ end
239
+
193
240
  def get_page
194
241
  inside_page = false
195
242
  page = ""
196
243
  while line = get_newline
197
- notify_parent
198
- @size_read ||=0; @size_read += line.bytesize
199
-
200
244
  if /<page>/ =~ line #
201
245
  page << line
202
246
  inside_page = true
@@ -215,22 +259,7 @@ module Wp2txt
215
259
  end
216
260
  end
217
261
 
218
- # call this method to do the job
219
262
  def extract_text(&block)
220
- prepare
221
- if @convert
222
- if block
223
- extract_and_convert(&block)
224
- else
225
- extract_and_convert
226
- end
227
- else
228
- # output the original xml only split to files of the specified size
229
- extract
230
- end
231
- end
232
-
233
- def extract_and_convert(&block)
234
263
  in_text = false
235
264
  in_message = false
236
265
  result_text = ""
@@ -241,17 +270,15 @@ module Wp2txt
241
270
  pages = []
242
271
  data_empty = false
243
272
 
244
- begin
273
+ while !data_empty
245
274
  page = get_page
246
275
  if page
247
276
  pages << page
248
277
  else
249
278
  data_empty = true
250
279
  end
251
- if data_empty || pages.size == @num_threads
252
- # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
- pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
254
- page_text = {:order => n, :data => nil}
280
+ if data_empty
281
+ pages.each do |page|
255
282
  xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
256
283
  xml = xmlns + page + "</mediawiki>"
257
284
 
@@ -270,79 +297,22 @@ module Wp2txt
270
297
  end
271
298
  end
272
299
  article = Article.new(text, title, @strip_tmarker)
273
- page_text[:data] = block.call(article)
300
+ page_text = block.call(article)
301
+ output_text << page_text
274
302
  end
275
- page_text
276
- end
277
- pages.clear
278
- pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
279
- pages_text.each do |page_text|
280
- output_text << page_text
281
- @count ||= 0; @count += 1;
282
- @total_size = output_text.bytesize
283
- # flagged when data exceeds the size of output file
284
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
285
303
  end
286
304
 
287
- #close the present file, then open a new one
288
- if end_flag
289
- cleanup!(output_text)
305
+ cleanup!(output_text)
306
+ if output_text.size > 0
307
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
308
+ @fp = File.open(outfilename, "w")
290
309
  @fp.puts(output_text)
291
- output_text = ""
292
- @total_size = 0
293
- end_flag = false
294
310
  @fp.close
295
- @file_index += 1
296
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
297
- @outfiles << outfilename
298
- @fp = File.open(outfilename, "w")
299
- next
300
311
  end
301
- end
302
- end while !data_empty
303
-
304
- if output_text != ""
305
- cleanup!(output_text)
306
- @fp.puts(output_text)
307
- end
308
- notify_parent(true)
309
- @parent.after
310
- @fp.close
311
- rename(@outfiles)
312
- @parent.msg("Processing finished", 1)
313
- end
314
-
315
- def extract
316
- output_text = ""
317
- end_flag = false
318
- while text = get_newline
319
- @count ||= 0;@count += 1;
320
- @size_read ||=0;@size_read += text.bytesize
321
- @total_size += text.bytesize
322
- output_text << text
323
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
324
- notify_parent
325
- # never close the file until the end of the page even if end_flag is on
326
- if end_flag && /<\/page/ =~ text
327
- @fp.puts(output_text)
312
+ File.delete(@input_file) if @del_interfile
328
313
  output_text = ""
329
- @total_size = 0
330
- end_flag = false
331
- @fp.close
332
- @file_index += 1
333
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
334
- @outfiles << outfilename
335
- @fp = File.open(outfilename, "w")
336
- next
337
314
  end
338
315
  end
339
- @fp.puts(output_text) if output_text != ""
340
- notify_parent(true)
341
- @parent.after
342
- @fp.close
343
- rename(@outfiles)
344
- @parent.msg("Processing finished", 1)
345
- end
316
+ end
346
317
  end
347
318
  end
348
-
data/spec/utils_spec.rb CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
184
184
 
185
185
  describe "correct_inline_template!" do
186
186
  it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # correct_inline_template!(str_before)
190
- # expect(str_before).to eq str_after
191
187
  str_before = "{{MedalCountry | {{JPN}} }}"
192
188
  str_after = "JPN"
193
189
  correct_inline_template!(str_before)
@@ -197,11 +193,11 @@ describe "Wp2txt" do
197
193
  correct_inline_template!(str_before)
198
194
  expect(str_before).to eq str_after
199
195
  str_before = "{{a|b=c|d=f}}"
200
- str_after = "a"
196
+ str_after = "c"
201
197
  correct_inline_template!(str_before)
202
198
  expect(str_before).to eq str_after
203
199
  str_before = "{{a|b|{{c|d|e}}}}"
204
- str_after = "e"
200
+ str_after = "b"
205
201
  correct_inline_template!(str_before)
206
202
  expect(str_before).to eq str_after
207
203
  str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
@@ -210,18 +206,4 @@ describe "Wp2txt" do
210
206
  expect(str_before).to eq str_after
211
207
  end
212
208
  end
213
-
214
- # describe "expand_template" do
215
- # it "gets data corresponding to a given template using mediawiki api" do
216
- # uri = "http://en.wiktionary.org/w/api.php"
217
- # template = "{{en-verb}}"
218
- # word = "kick"
219
- # expanded = expand_template(uri, template, word)
220
- # html =<<EOD
221
- # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
222
- # EOD
223
- # html.strip!
224
- # expanded.should == html
225
- # end
226
- # end
227
- end
209
+ end
data/wp2txt.gemspec CHANGED
@@ -14,6 +14,7 @@ Gem::Specification.new do |s|
14
14
  s.rubyforge_project = "wp2txt"
15
15
 
16
16
  s.files = `git ls-files`.split("\n")
17
+ s.files -= ["data/*", "image/*"]
17
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
20
  s.require_paths = ["lib"]
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
23
24
  # s.add_development_dependency "rake"
24
25
 
25
26
  s.add_dependency "nokogiri"
27
+ s.add_dependency "ruby-progressbar"
26
28
  s.add_dependency "parallel"
27
29
  s.add_dependency "htmlentities"
28
30
  s.add_dependency "optimist"
31
+ s.add_dependency "pastel"
32
+ s.add_dependency "tty-spinner"
29
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.5.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-02 00:00:00.000000000 Z
11
+ date: 2022-08-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ruby-progressbar
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: parallel
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,12 +80,39 @@ dependencies:
66
80
  - - ">="
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pastel
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: tty-spinner
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
69
111
  description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
70
112
  XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
71
113
  email:
72
114
  - yohasebe@gmail.com
73
115
  executables:
74
- - benchmark.rb
75
116
  - wp2txt
76
117
  extensions: []
77
118
  extra_rdoc_files: []
@@ -81,20 +122,20 @@ files:
81
122
  - LICENSE
82
123
  - README.md
83
124
  - Rakefile
84
- - bin/benchmark.rb
85
125
  - bin/wp2txt
86
126
  - data/output_samples/testdata_en.txt
87
- - data/output_samples/testdata_en_categories.txt
127
+ - data/output_samples/testdata_en_category.txt
88
128
  - data/output_samples/testdata_en_summary.txt
89
129
  - data/output_samples/testdata_ja.txt
90
- - data/output_samples/testdata_ja_categories.txt
130
+ - data/output_samples/testdata_ja_category.txt
91
131
  - data/output_samples/testdata_ja_summary.txt
92
132
  - data/testdata_en.bz2
93
133
  - data/testdata_ja.bz2
134
+ - image/screenshot.png
135
+ - image/wp2txt-logo.svg
136
+ - image/wp2txt.svg
94
137
  - lib/wp2txt.rb
95
138
  - lib/wp2txt/article.rb
96
- - lib/wp2txt/mw_api.rb
97
- - lib/wp2txt/progressbar.rb
98
139
  - lib/wp2txt/utils.rb
99
140
  - lib/wp2txt/version.rb
100
141
  - spec/spec_helper.rb
@@ -118,7 +159,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
159
  - !ruby/object:Gem::Version
119
160
  version: '0'
120
161
  requirements: []
121
- rubygems_version: 3.3.7
162
+ rubygems_version: 3.3.3
122
163
  signing_key:
123
164
  specification_version: 4
124
165
  summary: Wikipedia dump to text converter