wp2txt 0.9.5 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +134 -57
- data/bin/wp2txt +149 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +92 -68
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +154 -171
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +7 -3
- metadata +54 -12
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/lib/wp2txt.rb
CHANGED
@@ -4,42 +4,25 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
6
|
require "nokogiri"
|
7
|
-
require "parallel"
|
8
|
-
|
9
|
-
require 'etc'
|
10
|
-
require 'pp'
|
11
7
|
require "wp2txt/article"
|
12
8
|
require "wp2txt/utils"
|
13
|
-
require "wp2txt/progressbar"
|
14
|
-
# require "wp2txt/mw_api"
|
15
|
-
|
16
|
-
begin
|
17
|
-
require "bzip2-ruby"
|
18
|
-
NO_BZ2 = false
|
19
|
-
rescue LoadError
|
20
|
-
# in case bzip2-ruby gem is not available
|
21
|
-
NO_BZ2 = true
|
22
|
-
end
|
23
9
|
|
24
10
|
module Wp2txt
|
25
|
-
class
|
26
|
-
|
11
|
+
class Splitter
|
27
12
|
include Wp2txt
|
28
|
-
|
29
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
30
|
-
@parent = parent
|
13
|
+
def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
|
31
14
|
@fp = nil
|
32
|
-
|
33
15
|
@input_file = input_file
|
34
16
|
@output_dir = output_dir
|
35
17
|
@tfile_size = tfile_size
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
@
|
18
|
+
if bz2_gem
|
19
|
+
require "bzip2-ruby"
|
20
|
+
end
|
21
|
+
@bz2_gem = bz2_gem
|
22
|
+
prepare
|
40
23
|
end
|
41
|
-
|
42
|
-
def file_size(file)
|
24
|
+
|
25
|
+
def file_size(file)
|
43
26
|
origin = Time.now
|
44
27
|
size = 0; unit = 10485760; star = 0; before = Time.now.to_f
|
45
28
|
error_count = 10
|
@@ -49,7 +32,7 @@ module Wp2txt
|
|
49
32
|
rescue => e
|
50
33
|
a = nil
|
51
34
|
end
|
52
|
-
break unless a
|
35
|
+
break unless a
|
53
36
|
|
54
37
|
present = Time.now.to_f
|
55
38
|
size += a.size
|
@@ -57,88 +40,62 @@ module Wp2txt
|
|
57
40
|
star = 0 if star > 10
|
58
41
|
star += 1
|
59
42
|
before = present
|
60
|
-
end
|
43
|
+
end
|
61
44
|
end
|
62
45
|
time_elapsed = Time.now - origin
|
63
46
|
size
|
64
47
|
end
|
65
|
-
|
66
|
-
# control the display of command line progressbar (or gui which is not available for now)
|
67
|
-
def notify_parent(last = false)
|
68
|
-
@last_time ||= Time.now.to_f
|
69
|
-
@elapsed_sum ||= 0
|
70
|
-
time_now = Time.now.to_f
|
71
|
-
elapsed_from_last = (time_now - @last_time).to_i
|
72
|
-
|
73
|
-
if elapsed_from_last > 0.3 || last
|
74
48
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
49
|
+
# check if a given command exists: return the path if it does, return false if not
|
50
|
+
def command_exist?(command)
|
51
|
+
basename = File.basename(command)
|
52
|
+
path = ""
|
53
|
+
print "Checking #{basename}: "
|
54
|
+
if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
|
55
|
+
puts "detected [#{path}]"
|
56
|
+
return path.strip
|
57
|
+
elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
|
58
|
+
puts "detected [#{path}]"
|
59
|
+
return path.strip
|
60
|
+
else
|
61
|
+
puts "not found"
|
62
|
+
return false
|
87
63
|
end
|
88
64
|
end
|
89
65
|
|
90
|
-
# check the size of input file (bz2 or plain xml) when
|
66
|
+
# check the size of input file (bz2 or plain xml) when decompressed
|
91
67
|
def prepare
|
92
|
-
|
93
68
|
# if output_dir is not specified, output in the same directory
|
94
69
|
# as the input file
|
95
70
|
if !@output_dir && @input_file
|
96
71
|
@output_dir = File.dirname(@input_file)
|
97
72
|
end
|
98
73
|
|
99
|
-
# if input file is bz2 compressed, use bz2-ruby if available,
|
100
|
-
# use command line bzip2 program otherwise.
|
101
74
|
if /.bz2$/ =~ @input_file
|
102
|
-
|
103
|
-
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
-
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
105
|
-
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
|
-
@infile_size = file_size(file)
|
107
|
-
@parent.msg("... Done.", 1)
|
108
|
-
file.close
|
75
|
+
if @bz2_gem
|
109
76
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
77
|
+
elsif RUBY_PLATFORM.index("win32")
|
78
|
+
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
110
79
|
else
|
111
|
-
if
|
112
|
-
|
113
|
-
|
114
|
-
file = IO.popen("
|
115
|
-
end
|
116
|
-
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
117
|
-
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
118
|
-
@infile_size = file_size(file)
|
119
|
-
@parent.msg("... Done.", 1)
|
120
|
-
file.close # try to reopen since rewind method is unavailable
|
121
|
-
if RUBY_PLATFORM.index("win32")
|
122
|
-
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
123
|
-
else
|
124
|
-
file = IO.popen("bzip2 -c -d #{@input_file}")
|
80
|
+
if bzpath = command_exist?("lbzip2") ||
|
81
|
+
command_exist?("pbzip2") ||
|
82
|
+
command_exist?("bzip2")
|
83
|
+
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
125
84
|
end
|
126
|
-
end
|
85
|
+
end
|
127
86
|
else # meaning that it is a text file
|
128
87
|
@infile_size = File.stat(@input_file).size
|
129
88
|
file = open(@input_file)
|
130
89
|
end
|
131
90
|
|
132
91
|
#create basename of output file
|
133
|
-
@outfile_base = File.basename(@input_file, ".*") + "-"
|
92
|
+
@outfile_base = File.basename(@input_file, ".*") + "-"
|
134
93
|
@total_size = 0
|
135
94
|
@file_index = 1
|
136
95
|
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
137
96
|
@outfiles = []
|
138
97
|
@outfiles << outfilename
|
139
|
-
@fp = File.open(outfilename, "w")
|
140
|
-
@parent.before
|
141
|
-
@parent.data_set(@input_file, 100 * 100)
|
98
|
+
@fp = File.open(outfilename, "w")
|
142
99
|
@file_pointer = file
|
143
100
|
return true
|
144
101
|
end
|
@@ -156,7 +113,110 @@ module Wp2txt
|
|
156
113
|
# temp_buf is filled with text split by "\n"
|
157
114
|
temp_buf = []
|
158
115
|
ss = StringScanner.new(new_lines)
|
159
|
-
while ss.scan(/.*?\n/m)
|
116
|
+
while ss.scan(/.*?\n/m)
|
117
|
+
temp_buf << ss[0]
|
118
|
+
end
|
119
|
+
temp_buf << ss.rest unless ss.eos?
|
120
|
+
|
121
|
+
new_first_line = temp_buf.shift
|
122
|
+
if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
123
|
+
@buffer.last << new_first_line
|
124
|
+
@buffer << ""
|
125
|
+
else
|
126
|
+
@buffer.last << new_first_line
|
127
|
+
end
|
128
|
+
@buffer += temp_buf unless temp_buf.empty?
|
129
|
+
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
130
|
+
@buffer << ""
|
131
|
+
end
|
132
|
+
break if @buffer.size > 1
|
133
|
+
end
|
134
|
+
return true
|
135
|
+
end
|
136
|
+
|
137
|
+
def get_newline
|
138
|
+
@buffer ||= [""]
|
139
|
+
if @buffer.size == 1
|
140
|
+
return nil unless fill_buffer
|
141
|
+
end
|
142
|
+
if @buffer.empty?
|
143
|
+
return nil
|
144
|
+
else
|
145
|
+
new_line = @buffer.shift
|
146
|
+
return new_line
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def split_file
|
151
|
+
output_text = ""
|
152
|
+
end_flag = false
|
153
|
+
while text = get_newline
|
154
|
+
@count ||= 0;@count += 1;
|
155
|
+
@size_read ||=0
|
156
|
+
@size_read += text.bytesize
|
157
|
+
@total_size += text.bytesize
|
158
|
+
output_text << text
|
159
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
160
|
+
# never close the file until the end of the page even if end_flag is on
|
161
|
+
if end_flag && /<\/page/ =~ text
|
162
|
+
@fp.puts(output_text)
|
163
|
+
output_text = ""
|
164
|
+
@total_size = 0
|
165
|
+
end_flag = false
|
166
|
+
@fp.close
|
167
|
+
@file_index += 1
|
168
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
169
|
+
@outfiles << outfilename
|
170
|
+
@fp = File.open(outfilename, "w")
|
171
|
+
next
|
172
|
+
end
|
173
|
+
end
|
174
|
+
@fp.puts(output_text) if output_text != ""
|
175
|
+
@fp.close
|
176
|
+
|
177
|
+
if File.size(outfilename) == 0
|
178
|
+
File.delete(outfilename)
|
179
|
+
@outfiles.delete(outfilename)
|
180
|
+
end
|
181
|
+
|
182
|
+
rename(@outfiles, "xml")
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
class Runner
|
187
|
+
include Wp2txt
|
188
|
+
|
189
|
+
def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
|
190
|
+
@fp = nil
|
191
|
+
@input_file = input_file
|
192
|
+
@output_dir = output_dir
|
193
|
+
@strip_tmarker = strip_tmarker
|
194
|
+
@del_interfile = del_interfile
|
195
|
+
prepare
|
196
|
+
end
|
197
|
+
|
198
|
+
def prepare
|
199
|
+
@infile_size = File.stat(@input_file).size
|
200
|
+
file = open(@input_file)
|
201
|
+
@file_pointer = file
|
202
|
+
@outfile_base = File.basename(@input_file, ".*")
|
203
|
+
@total_size = 0
|
204
|
+
return true
|
205
|
+
end
|
206
|
+
|
207
|
+
def fill_buffer
|
208
|
+
while true do
|
209
|
+
begin
|
210
|
+
new_lines = @file_pointer.read(10485760)
|
211
|
+
rescue => e
|
212
|
+
return nil
|
213
|
+
end
|
214
|
+
return nil unless new_lines
|
215
|
+
|
216
|
+
# temp_buf is filled with text split by "\n"
|
217
|
+
temp_buf = []
|
218
|
+
ss = StringScanner.new(new_lines)
|
219
|
+
while ss.scan(/.*?\n/m)
|
160
220
|
temp_buf << ss[0]
|
161
221
|
end
|
162
222
|
temp_buf << ss.rest unless ss.eos?
|
@@ -178,25 +238,22 @@ module Wp2txt
|
|
178
238
|
end
|
179
239
|
|
180
240
|
def get_newline
|
181
|
-
@buffer ||= [""]
|
241
|
+
@buffer ||= [""]
|
182
242
|
if @buffer.size == 1
|
183
243
|
return nil unless fill_buffer
|
184
244
|
end
|
185
245
|
if @buffer.empty?
|
186
246
|
return nil
|
187
|
-
else
|
247
|
+
else
|
188
248
|
new_line = @buffer.shift
|
189
249
|
return new_line
|
190
|
-
end
|
250
|
+
end
|
191
251
|
end
|
192
252
|
|
193
253
|
def get_page
|
194
254
|
inside_page = false
|
195
255
|
page = ""
|
196
256
|
while line = get_newline
|
197
|
-
notify_parent
|
198
|
-
@size_read ||=0; @size_read += line.bytesize
|
199
|
-
|
200
257
|
if /<page>/ =~ line #
|
201
258
|
page << line
|
202
259
|
inside_page = true
|
@@ -215,22 +272,7 @@ module Wp2txt
|
|
215
272
|
end
|
216
273
|
end
|
217
274
|
|
218
|
-
# call this method to do the job
|
219
275
|
def extract_text(&block)
|
220
|
-
prepare
|
221
|
-
if @convert
|
222
|
-
if block
|
223
|
-
extract_and_convert(&block)
|
224
|
-
else
|
225
|
-
extract_and_convert
|
226
|
-
end
|
227
|
-
else
|
228
|
-
# output the original xml only split to files of the specified size
|
229
|
-
extract
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
def extract_and_convert(&block)
|
234
276
|
in_text = false
|
235
277
|
in_message = false
|
236
278
|
result_text = ""
|
@@ -241,17 +283,15 @@ module Wp2txt
|
|
241
283
|
pages = []
|
242
284
|
data_empty = false
|
243
285
|
|
244
|
-
|
286
|
+
while !data_empty
|
245
287
|
page = get_page
|
246
288
|
if page
|
247
289
|
pages << page
|
248
290
|
else
|
249
291
|
data_empty = true
|
250
292
|
end
|
251
|
-
if data_empty
|
252
|
-
|
253
|
-
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
254
|
-
page_text = {:order => n, :data => nil}
|
293
|
+
if data_empty
|
294
|
+
pages.each do |page|
|
255
295
|
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
256
296
|
xml = xmlns + page + "</mediawiki>"
|
257
297
|
|
@@ -270,79 +310,22 @@ module Wp2txt
|
|
270
310
|
end
|
271
311
|
end
|
272
312
|
article = Article.new(text, title, @strip_tmarker)
|
273
|
-
page_text
|
313
|
+
page_text = block.call(article)
|
314
|
+
output_text << page_text
|
274
315
|
end
|
275
|
-
page_text
|
276
|
-
end
|
277
|
-
pages.clear
|
278
|
-
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
279
|
-
pages_text.each do |page_text|
|
280
|
-
output_text << page_text
|
281
|
-
@count ||= 0; @count += 1;
|
282
|
-
@total_size = output_text.bytesize
|
283
|
-
# flagged when data exceeds the size of output file
|
284
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
285
316
|
end
|
286
317
|
|
287
|
-
|
288
|
-
if
|
289
|
-
|
318
|
+
cleanup!(output_text)
|
319
|
+
if output_text.size > 0
|
320
|
+
outfilename = File.join(@output_dir, @outfile_base + ".txt")
|
321
|
+
@fp = File.open(outfilename, "w")
|
290
322
|
@fp.puts(output_text)
|
291
|
-
output_text = ""
|
292
|
-
@total_size = 0
|
293
|
-
end_flag = false
|
294
323
|
@fp.close
|
295
|
-
@file_index += 1
|
296
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
297
|
-
@outfiles << outfilename
|
298
|
-
@fp = File.open(outfilename, "w")
|
299
|
-
next
|
300
324
|
end
|
301
|
-
|
302
|
-
end while !data_empty
|
303
|
-
|
304
|
-
if output_text != ""
|
305
|
-
cleanup!(output_text)
|
306
|
-
@fp.puts(output_text)
|
307
|
-
end
|
308
|
-
notify_parent(true)
|
309
|
-
@parent.after
|
310
|
-
@fp.close
|
311
|
-
rename(@outfiles)
|
312
|
-
@parent.msg("Processing finished", 1)
|
313
|
-
end
|
314
|
-
|
315
|
-
def extract
|
316
|
-
output_text = ""
|
317
|
-
end_flag = false
|
318
|
-
while text = get_newline
|
319
|
-
@count ||= 0;@count += 1;
|
320
|
-
@size_read ||=0;@size_read += text.bytesize
|
321
|
-
@total_size += text.bytesize
|
322
|
-
output_text << text
|
323
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
324
|
-
notify_parent
|
325
|
-
# never close the file until the end of the page even if end_flag is on
|
326
|
-
if end_flag && /<\/page/ =~ text
|
327
|
-
@fp.puts(output_text)
|
325
|
+
File.delete(@input_file) if @del_interfile
|
328
326
|
output_text = ""
|
329
|
-
@total_size = 0
|
330
|
-
end_flag = false
|
331
|
-
@fp.close
|
332
|
-
@file_index += 1
|
333
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
334
|
-
@outfiles << outfilename
|
335
|
-
@fp = File.open(outfilename, "w")
|
336
|
-
next
|
337
327
|
end
|
338
328
|
end
|
339
|
-
|
340
|
-
notify_parent(true)
|
341
|
-
@parent.after
|
342
|
-
@fp.close
|
343
|
-
rename(@outfiles)
|
344
|
-
@parent.msg("Processing finished", 1)
|
345
|
-
end
|
329
|
+
end
|
346
330
|
end
|
347
331
|
end
|
348
|
-
|
data/spec/utils_spec.rb
CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
|
|
184
184
|
|
185
185
|
describe "correct_inline_template!" do
|
186
186
|
it "removes brackets and leaving some text" do
|
187
|
-
# str_before = "{{}}"
|
188
|
-
# str_after = ""
|
189
|
-
# correct_inline_template!(str_before)
|
190
|
-
# expect(str_before).to eq str_after
|
191
187
|
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
188
|
str_after = "JPN"
|
193
189
|
correct_inline_template!(str_before)
|
@@ -197,11 +193,11 @@ describe "Wp2txt" do
|
|
197
193
|
correct_inline_template!(str_before)
|
198
194
|
expect(str_before).to eq str_after
|
199
195
|
str_before = "{{a|b=c|d=f}}"
|
200
|
-
str_after = "
|
196
|
+
str_after = "c"
|
201
197
|
correct_inline_template!(str_before)
|
202
198
|
expect(str_before).to eq str_after
|
203
199
|
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
-
str_after = "
|
200
|
+
str_after = "b"
|
205
201
|
correct_inline_template!(str_before)
|
206
202
|
expect(str_before).to eq str_after
|
207
203
|
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
@@ -210,18 +206,4 @@ describe "Wp2txt" do
|
|
210
206
|
expect(str_before).to eq str_after
|
211
207
|
end
|
212
208
|
end
|
213
|
-
|
214
|
-
# describe "expand_template" do
|
215
|
-
# it "gets data corresponding to a given template using mediawiki api" do
|
216
|
-
# uri = "http://en.wiktionary.org/w/api.php"
|
217
|
-
# template = "{{en-verb}}"
|
218
|
-
# word = "kick"
|
219
|
-
# expanded = expand_template(uri, template, word)
|
220
|
-
# html =<<EOD
|
221
|
-
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
222
|
-
# EOD
|
223
|
-
# html.strip!
|
224
|
-
# expanded.should == html
|
225
|
-
# end
|
226
|
-
# end
|
227
|
-
end
|
209
|
+
end
|
data/wp2txt.gemspec
CHANGED
@@ -7,13 +7,14 @@ Gem::Specification.new do |s|
|
|
7
7
|
s.version = Wp2txt::VERSION
|
8
8
|
s.authors = ["Yoichiro Hasebe"]
|
9
9
|
s.email = ["yohasebe@gmail.com"]
|
10
|
-
s.homepage = "
|
11
|
-
s.summary = %q{
|
12
|
-
s.description = %q{WP2TXT extracts
|
10
|
+
s.homepage = "https://github.com/yohasebe/wp2txt"
|
11
|
+
s.summary = %q{A command-line toolkit to extract text content and category data from Wikipedia dump files}
|
12
|
+
s.description = %q{WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.}
|
13
13
|
|
14
14
|
s.rubyforge_project = "wp2txt"
|
15
15
|
|
16
16
|
s.files = `git ls-files`.split("\n")
|
17
|
+
s.files -= ["data/*", "image/*"]
|
17
18
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
20
|
s.require_paths = ["lib"]
|
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
|
|
23
24
|
# s.add_development_dependency "rake"
|
24
25
|
|
25
26
|
s.add_dependency "nokogiri"
|
27
|
+
s.add_dependency "ruby-progressbar"
|
26
28
|
s.add_dependency "parallel"
|
27
29
|
s.add_dependency "htmlentities"
|
28
30
|
s.add_dependency "optimist"
|
31
|
+
s.add_dependency "pastel"
|
32
|
+
s.add_dependency "tty-spinner"
|
29
33
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ruby-progressbar
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: parallel
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,12 +80,39 @@ dependencies:
|
|
66
80
|
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
69
|
-
|
70
|
-
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: pastel
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tty-spinner
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: WP2TXT extracts text and category data from Wikipedia dump files (encoded
|
112
|
+
in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
|
71
113
|
email:
|
72
114
|
- yohasebe@gmail.com
|
73
115
|
executables:
|
74
|
-
- benchmark.rb
|
75
116
|
- wp2txt
|
76
117
|
extensions: []
|
77
118
|
extra_rdoc_files: []
|
@@ -81,26 +122,26 @@ files:
|
|
81
122
|
- LICENSE
|
82
123
|
- README.md
|
83
124
|
- Rakefile
|
84
|
-
- bin/benchmark.rb
|
85
125
|
- bin/wp2txt
|
86
126
|
- data/output_samples/testdata_en.txt
|
87
|
-
- data/output_samples/
|
127
|
+
- data/output_samples/testdata_en_category.txt
|
88
128
|
- data/output_samples/testdata_en_summary.txt
|
89
129
|
- data/output_samples/testdata_ja.txt
|
90
|
-
- data/output_samples/
|
130
|
+
- data/output_samples/testdata_ja_category.txt
|
91
131
|
- data/output_samples/testdata_ja_summary.txt
|
92
132
|
- data/testdata_en.bz2
|
93
133
|
- data/testdata_ja.bz2
|
134
|
+
- image/screenshot.png
|
135
|
+
- image/wp2txt-logo.svg
|
136
|
+
- image/wp2txt.svg
|
94
137
|
- lib/wp2txt.rb
|
95
138
|
- lib/wp2txt/article.rb
|
96
|
-
- lib/wp2txt/mw_api.rb
|
97
|
-
- lib/wp2txt/progressbar.rb
|
98
139
|
- lib/wp2txt/utils.rb
|
99
140
|
- lib/wp2txt/version.rb
|
100
141
|
- spec/spec_helper.rb
|
101
142
|
- spec/utils_spec.rb
|
102
143
|
- wp2txt.gemspec
|
103
|
-
homepage:
|
144
|
+
homepage: https://github.com/yohasebe/wp2txt
|
104
145
|
licenses: []
|
105
146
|
metadata: {}
|
106
147
|
post_install_message:
|
@@ -121,7 +162,8 @@ requirements: []
|
|
121
162
|
rubygems_version: 3.3.7
|
122
163
|
signing_key:
|
123
164
|
specification_version: 4
|
124
|
-
summary:
|
165
|
+
summary: A command-line toolkit to extract text content and category data from Wikipedia
|
166
|
+
dump files
|
125
167
|
test_files:
|
126
168
|
- spec/spec_helper.rb
|
127
169
|
- spec/utils_spec.rb
|