wp2txt 0.9.5.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +99 -58
- data/bin/wp2txt +143 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +48 -24
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +118 -148
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +4 -0
- metadata +50 -9
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/lib/wp2txt/utils.rb
CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
|
|
77
77
|
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
78
|
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
79
|
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
+
|
81
|
+
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
+
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
+
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
+
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
+
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
+
|
87
|
+
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
+
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
+
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
+
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
+
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
+
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
+
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
+
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
+
|
80
96
|
###################################################
|
81
97
|
|
82
98
|
module Wp2txt
|
@@ -104,11 +120,12 @@ module Wp2txt
|
|
104
120
|
end
|
105
121
|
|
106
122
|
def format_wiki!(text, has_retried = false)
|
123
|
+
remove_complex!(text)
|
124
|
+
|
107
125
|
escape_nowiki!(text)
|
108
126
|
process_interwiki_links!(text)
|
109
127
|
process_external_links!(text)
|
110
128
|
unescape_nowiki!(text)
|
111
|
-
|
112
129
|
remove_directive!(text)
|
113
130
|
remove_emphasis!(text)
|
114
131
|
mndash!(text)
|
@@ -120,15 +137,18 @@ module Wp2txt
|
|
120
137
|
end
|
121
138
|
|
122
139
|
def cleanup!(text)
|
123
|
-
text.gsub!(
|
124
|
-
text.gsub!(
|
125
|
-
text.gsub!(
|
126
|
-
text.gsub!(
|
127
|
-
text.gsub!(
|
128
|
-
text.gsub!(
|
140
|
+
text.gsub!($cleanup_regex_01){""}
|
141
|
+
text.gsub!($cleanup_regex_02){""}
|
142
|
+
text.gsub!($cleanup_regex_03){""}
|
143
|
+
text.gsub!($cleanup_regex_04){""}
|
144
|
+
text.gsub!($cleanup_regex_05){""}
|
145
|
+
text.gsub!($cleanup_regex_06){""}
|
146
|
+
text.gsub!($cleanup_regex_07){""}
|
147
|
+
text.gsub!($cleanup_regex_08){"\n\n"}
|
129
148
|
text.strip!
|
130
149
|
text << "\n\n"
|
131
150
|
end
|
151
|
+
|
132
152
|
#################### parser for nested structure ####################
|
133
153
|
|
134
154
|
def process_nested_structure(scanner, left, right, &block)
|
@@ -217,12 +237,16 @@ module Wp2txt
|
|
217
237
|
def process_external_links!(str)
|
218
238
|
scanner = StringScanner.new(str)
|
219
239
|
result = process_nested_structure(scanner, "[", "]") do |contents|
|
220
|
-
|
221
|
-
|
222
|
-
when 1
|
223
|
-
parts.first || ""
|
240
|
+
if /\A\s.+\s\z/ =~ contents
|
241
|
+
" (#{contents.strip}) "
|
224
242
|
else
|
225
|
-
parts
|
243
|
+
parts = contents.split(" ", 2)
|
244
|
+
case parts.size
|
245
|
+
when 1
|
246
|
+
parts.first || ""
|
247
|
+
else
|
248
|
+
parts.last || ""
|
249
|
+
end
|
226
250
|
end
|
227
251
|
end
|
228
252
|
str.replace(result)
|
@@ -239,10 +263,6 @@ module Wp2txt
|
|
239
263
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
240
264
|
""
|
241
265
|
end
|
242
|
-
scanner = StringScanner.new(result)
|
243
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
-
""
|
245
|
-
end
|
246
266
|
str.replace(result)
|
247
267
|
end
|
248
268
|
|
@@ -310,7 +330,8 @@ module Wp2txt
|
|
310
330
|
end
|
311
331
|
|
312
332
|
def remove_html!(str)
|
313
|
-
["
|
333
|
+
str.gsub!(/<[^<>]+\/>/){""}
|
334
|
+
["div", "gallery", "timeline", "noinclude"].each do |tag|
|
314
335
|
scanner = StringScanner.new(str)
|
315
336
|
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
337
|
""
|
@@ -320,11 +341,11 @@ module Wp2txt
|
|
320
341
|
end
|
321
342
|
|
322
343
|
def remove_complex!(str)
|
323
|
-
str.gsub!(
|
324
|
-
str.gsub!(
|
325
|
-
str.gsub!(
|
326
|
-
str.gsub!(
|
327
|
-
str.gsub!(
|
344
|
+
str.gsub!($complex_regex_01){"《#{$1}》"}
|
345
|
+
str.gsub!($complex_regex_02){""}
|
346
|
+
str.gsub!($complex_regex_03){""}
|
347
|
+
str.gsub!($complex_regex_04){""}
|
348
|
+
str.gsub!($complex_regex_05){""}
|
328
349
|
end
|
329
350
|
|
330
351
|
def make_reference!(str)
|
@@ -340,6 +361,8 @@ module Wp2txt
|
|
340
361
|
parts = contents.split("|")
|
341
362
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
363
|
parts.shift
|
364
|
+
elsif /\Alang\-/i =~ parts[0]
|
365
|
+
parts.shift
|
343
366
|
elsif /\Alang=/i =~ parts[1]
|
344
367
|
parts.shift
|
345
368
|
end
|
@@ -422,7 +445,7 @@ module Wp2txt
|
|
422
445
|
end
|
423
446
|
end
|
424
447
|
|
425
|
-
def rename(files)
|
448
|
+
def rename(files, ext = "txt")
|
426
449
|
# num of digits necessary to name the last file generated
|
427
450
|
maxwidth = 0
|
428
451
|
|
@@ -435,8 +458,9 @@ module Wp2txt
|
|
435
458
|
newname= f.sub(/\-(\d+)\z/) do
|
436
459
|
"-" + sprintf("%0#{maxwidth}d", $1.to_i)
|
437
460
|
end
|
438
|
-
File.rename(f, newname + "
|
461
|
+
File.rename(f, newname + ".#{ext}")
|
439
462
|
end
|
463
|
+
return true
|
440
464
|
end
|
441
465
|
|
442
466
|
# convert int of seconds to string in the format 00:00:00
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -4,14 +4,8 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
6
|
require "nokogiri"
|
7
|
-
require "parallel"
|
8
|
-
|
9
|
-
require 'etc'
|
10
|
-
require 'pp'
|
11
7
|
require "wp2txt/article"
|
12
8
|
require "wp2txt/utils"
|
13
|
-
require "wp2txt/progressbar"
|
14
|
-
# require "wp2txt/mw_api"
|
15
9
|
|
16
10
|
begin
|
17
11
|
require "bzip2-ruby"
|
@@ -22,21 +16,14 @@ rescue LoadError
|
|
22
16
|
end
|
23
17
|
|
24
18
|
module Wp2txt
|
25
|
-
class
|
26
|
-
|
19
|
+
class Splitter
|
27
20
|
include Wp2txt
|
28
|
-
|
29
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
30
|
-
@parent = parent
|
21
|
+
def initialize(input_file, output_dir = ".", tfile_size = 10)
|
31
22
|
@fp = nil
|
32
|
-
|
33
23
|
@input_file = input_file
|
34
24
|
@output_dir = output_dir
|
35
25
|
@tfile_size = tfile_size
|
36
|
-
|
37
|
-
@strip_tmarker = strip_tmarker
|
38
|
-
num_cores_available = Etc.nprocessors
|
39
|
-
@num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
|
26
|
+
prepare
|
40
27
|
end
|
41
28
|
|
42
29
|
def file_size(file)
|
@@ -62,34 +49,9 @@ module Wp2txt
|
|
62
49
|
time_elapsed = Time.now - origin
|
63
50
|
size
|
64
51
|
end
|
65
|
-
|
66
|
-
# control the display of command line progressbar (or gui which is not available for now)
|
67
|
-
def notify_parent(last = false)
|
68
|
-
@last_time ||= Time.now.to_f
|
69
|
-
@elapsed_sum ||= 0
|
70
|
-
time_now = Time.now.to_f
|
71
|
-
elapsed_from_last = (time_now - @last_time).to_i
|
72
|
-
|
73
|
-
if elapsed_from_last > 0.3 || last
|
74
52
|
|
75
|
-
|
76
|
-
@elapsed_sum += elapsed_from_last
|
77
|
-
gvalue = (@size_read.to_f / @infile_size.to_f * 100 * 100).to_i
|
78
|
-
elt_str = sec_to_str(@elapsed_sum)
|
79
|
-
if last
|
80
|
-
eta_str = "00:00:00"
|
81
|
-
else
|
82
|
-
lines_persec = @size_read / @elapsed_sum if @elapsed_sum > 0
|
83
|
-
eta_sec = (@infile_size - @size_read) / lines_persec
|
84
|
-
eta_str = sec_to_str(eta_sec)
|
85
|
-
end
|
86
|
-
@parent.prg_update(gvalue, elt_str, eta_str)
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
# check the size of input file (bz2 or plain xml) when uncompressed
|
53
|
+
# check the size of input file (bz2 or plain xml) when decompressed
|
91
54
|
def prepare
|
92
|
-
|
93
55
|
# if output_dir is not specified, output in the same directory
|
94
56
|
# as the imput file
|
95
57
|
if !@output_dir && @input_file
|
@@ -101,28 +63,12 @@ module Wp2txt
|
|
101
63
|
if /.bz2$/ =~ @input_file
|
102
64
|
unless NO_BZ2
|
103
65
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
-
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
105
|
-
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
|
-
@infile_size = file_size(file)
|
107
|
-
@parent.msg("... Done.", 1)
|
108
|
-
file.close
|
109
|
-
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
110
66
|
else
|
111
67
|
if RUBY_PLATFORM.index("win32")
|
112
68
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
113
69
|
else
|
114
70
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
115
71
|
end
|
116
|
-
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
117
|
-
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
118
|
-
@infile_size = file_size(file)
|
119
|
-
@parent.msg("... Done.", 1)
|
120
|
-
file.close # try to reopen since rewind method is unavailable
|
121
|
-
if RUBY_PLATFORM.index("win32")
|
122
|
-
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
123
|
-
else
|
124
|
-
file = IO.popen("bzip2 -c -d #{@input_file}")
|
125
|
-
end
|
126
72
|
end
|
127
73
|
else # meaning that it is a text file
|
128
74
|
@infile_size = File.stat(@input_file).size
|
@@ -137,8 +83,6 @@ module Wp2txt
|
|
137
83
|
@outfiles = []
|
138
84
|
@outfiles << outfilename
|
139
85
|
@fp = File.open(outfilename, "w")
|
140
|
-
@parent.before
|
141
|
-
@parent.data_set(@input_file, 100 * 100)
|
142
86
|
@file_pointer = file
|
143
87
|
return true
|
144
88
|
end
|
@@ -190,13 +134,113 @@ module Wp2txt
|
|
190
134
|
end
|
191
135
|
end
|
192
136
|
|
137
|
+
def split_file
|
138
|
+
output_text = ""
|
139
|
+
end_flag = false
|
140
|
+
while text = get_newline
|
141
|
+
@count ||= 0;@count += 1;
|
142
|
+
@size_read ||=0
|
143
|
+
@size_read += text.bytesize
|
144
|
+
@total_size += text.bytesize
|
145
|
+
output_text << text
|
146
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
147
|
+
# never close the file until the end of the page even if end_flag is on
|
148
|
+
if end_flag && /<\/page/ =~ text
|
149
|
+
@fp.puts(output_text)
|
150
|
+
output_text = ""
|
151
|
+
@total_size = 0
|
152
|
+
end_flag = false
|
153
|
+
@fp.close
|
154
|
+
@file_index += 1
|
155
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
156
|
+
@outfiles << outfilename
|
157
|
+
@fp = File.open(outfilename, "w")
|
158
|
+
next
|
159
|
+
end
|
160
|
+
end
|
161
|
+
@fp.puts(output_text) if output_text != ""
|
162
|
+
@fp.close
|
163
|
+
|
164
|
+
if File.size(outfilename) == 0
|
165
|
+
File.delete(outfilename)
|
166
|
+
@outfiles.delete(outfilename)
|
167
|
+
end
|
168
|
+
|
169
|
+
rename(@outfiles, "xml")
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
class Runner
|
174
|
+
include Wp2txt
|
175
|
+
|
176
|
+
def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
|
177
|
+
@fp = nil
|
178
|
+
@input_file = input_file
|
179
|
+
@output_dir = output_dir
|
180
|
+
@strip_tmarker = strip_tmarker
|
181
|
+
@del_interfile = del_interfile
|
182
|
+
prepare
|
183
|
+
end
|
184
|
+
|
185
|
+
def prepare
|
186
|
+
@infile_size = File.stat(@input_file).size
|
187
|
+
file = open(@input_file)
|
188
|
+
@file_pointer = file
|
189
|
+
@outfile_base = File.basename(@input_file, ".*")
|
190
|
+
@total_size = 0
|
191
|
+
return true
|
192
|
+
end
|
193
|
+
|
194
|
+
def fill_buffer
|
195
|
+
while true do
|
196
|
+
begin
|
197
|
+
new_lines = @file_pointer.read(10485760)
|
198
|
+
rescue => e
|
199
|
+
return nil
|
200
|
+
end
|
201
|
+
return nil unless new_lines
|
202
|
+
|
203
|
+
# temp_buf is filled with text split by "\n"
|
204
|
+
temp_buf = []
|
205
|
+
ss = StringScanner.new(new_lines)
|
206
|
+
while ss.scan(/.*?\n/m)
|
207
|
+
temp_buf << ss[0]
|
208
|
+
end
|
209
|
+
temp_buf << ss.rest unless ss.eos?
|
210
|
+
|
211
|
+
new_first_line = temp_buf.shift
|
212
|
+
if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
213
|
+
@buffer.last << new_first_line
|
214
|
+
@buffer << ""
|
215
|
+
else
|
216
|
+
@buffer.last << new_first_line
|
217
|
+
end
|
218
|
+
@buffer += temp_buf unless temp_buf.empty?
|
219
|
+
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
220
|
+
@buffer << ""
|
221
|
+
end
|
222
|
+
break if @buffer.size > 1
|
223
|
+
end
|
224
|
+
return true
|
225
|
+
end
|
226
|
+
|
227
|
+
def get_newline
|
228
|
+
@buffer ||= [""]
|
229
|
+
if @buffer.size == 1
|
230
|
+
return nil unless fill_buffer
|
231
|
+
end
|
232
|
+
if @buffer.empty?
|
233
|
+
return nil
|
234
|
+
else
|
235
|
+
new_line = @buffer.shift
|
236
|
+
return new_line
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
193
240
|
def get_page
|
194
241
|
inside_page = false
|
195
242
|
page = ""
|
196
243
|
while line = get_newline
|
197
|
-
notify_parent
|
198
|
-
@size_read ||=0; @size_read += line.bytesize
|
199
|
-
|
200
244
|
if /<page>/ =~ line #
|
201
245
|
page << line
|
202
246
|
inside_page = true
|
@@ -215,22 +259,7 @@ module Wp2txt
|
|
215
259
|
end
|
216
260
|
end
|
217
261
|
|
218
|
-
# call this method to do the job
|
219
262
|
def extract_text(&block)
|
220
|
-
prepare
|
221
|
-
if @convert
|
222
|
-
if block
|
223
|
-
extract_and_convert(&block)
|
224
|
-
else
|
225
|
-
extract_and_convert
|
226
|
-
end
|
227
|
-
else
|
228
|
-
# output the original xml only split to files of the specified size
|
229
|
-
extract
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
def extract_and_convert(&block)
|
234
263
|
in_text = false
|
235
264
|
in_message = false
|
236
265
|
result_text = ""
|
@@ -241,17 +270,15 @@ module Wp2txt
|
|
241
270
|
pages = []
|
242
271
|
data_empty = false
|
243
272
|
|
244
|
-
|
273
|
+
while !data_empty
|
245
274
|
page = get_page
|
246
275
|
if page
|
247
276
|
pages << page
|
248
277
|
else
|
249
278
|
data_empty = true
|
250
279
|
end
|
251
|
-
if data_empty
|
252
|
-
|
253
|
-
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
254
|
-
page_text = {:order => n, :data => nil}
|
280
|
+
if data_empty
|
281
|
+
pages.each do |page|
|
255
282
|
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
256
283
|
xml = xmlns + page + "</mediawiki>"
|
257
284
|
|
@@ -270,79 +297,22 @@ module Wp2txt
|
|
270
297
|
end
|
271
298
|
end
|
272
299
|
article = Article.new(text, title, @strip_tmarker)
|
273
|
-
page_text
|
300
|
+
page_text = block.call(article)
|
301
|
+
output_text << page_text
|
274
302
|
end
|
275
|
-
page_text
|
276
|
-
end
|
277
|
-
pages.clear
|
278
|
-
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
279
|
-
pages_text.each do |page_text|
|
280
|
-
output_text << page_text
|
281
|
-
@count ||= 0; @count += 1;
|
282
|
-
@total_size = output_text.bytesize
|
283
|
-
# flagged when data exceeds the size of output file
|
284
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
285
303
|
end
|
286
304
|
|
287
|
-
|
288
|
-
if
|
289
|
-
|
305
|
+
cleanup!(output_text)
|
306
|
+
if output_text.size > 0
|
307
|
+
outfilename = File.join(@output_dir, @outfile_base + ".txt")
|
308
|
+
@fp = File.open(outfilename, "w")
|
290
309
|
@fp.puts(output_text)
|
291
|
-
output_text = ""
|
292
|
-
@total_size = 0
|
293
|
-
end_flag = false
|
294
310
|
@fp.close
|
295
|
-
@file_index += 1
|
296
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
297
|
-
@outfiles << outfilename
|
298
|
-
@fp = File.open(outfilename, "w")
|
299
|
-
next
|
300
311
|
end
|
301
|
-
|
302
|
-
end while !data_empty
|
303
|
-
|
304
|
-
if output_text != ""
|
305
|
-
cleanup!(output_text)
|
306
|
-
@fp.puts(output_text)
|
307
|
-
end
|
308
|
-
notify_parent(true)
|
309
|
-
@parent.after
|
310
|
-
@fp.close
|
311
|
-
rename(@outfiles)
|
312
|
-
@parent.msg("Processing finished", 1)
|
313
|
-
end
|
314
|
-
|
315
|
-
def extract
|
316
|
-
output_text = ""
|
317
|
-
end_flag = false
|
318
|
-
while text = get_newline
|
319
|
-
@count ||= 0;@count += 1;
|
320
|
-
@size_read ||=0;@size_read += text.bytesize
|
321
|
-
@total_size += text.bytesize
|
322
|
-
output_text << text
|
323
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
324
|
-
notify_parent
|
325
|
-
# never close the file until the end of the page even if end_flag is on
|
326
|
-
if end_flag && /<\/page/ =~ text
|
327
|
-
@fp.puts(output_text)
|
312
|
+
File.delete(@input_file) if @del_interfile
|
328
313
|
output_text = ""
|
329
|
-
@total_size = 0
|
330
|
-
end_flag = false
|
331
|
-
@fp.close
|
332
|
-
@file_index += 1
|
333
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
334
|
-
@outfiles << outfilename
|
335
|
-
@fp = File.open(outfilename, "w")
|
336
|
-
next
|
337
314
|
end
|
338
315
|
end
|
339
|
-
|
340
|
-
notify_parent(true)
|
341
|
-
@parent.after
|
342
|
-
@fp.close
|
343
|
-
rename(@outfiles)
|
344
|
-
@parent.msg("Processing finished", 1)
|
345
|
-
end
|
316
|
+
end
|
346
317
|
end
|
347
318
|
end
|
348
|
-
|
data/spec/utils_spec.rb
CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
|
|
184
184
|
|
185
185
|
describe "correct_inline_template!" do
|
186
186
|
it "removes brackets and leaving some text" do
|
187
|
-
# str_before = "{{}}"
|
188
|
-
# str_after = ""
|
189
|
-
# correct_inline_template!(str_before)
|
190
|
-
# expect(str_before).to eq str_after
|
191
187
|
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
188
|
str_after = "JPN"
|
193
189
|
correct_inline_template!(str_before)
|
@@ -197,11 +193,11 @@ describe "Wp2txt" do
|
|
197
193
|
correct_inline_template!(str_before)
|
198
194
|
expect(str_before).to eq str_after
|
199
195
|
str_before = "{{a|b=c|d=f}}"
|
200
|
-
str_after = "
|
196
|
+
str_after = "c"
|
201
197
|
correct_inline_template!(str_before)
|
202
198
|
expect(str_before).to eq str_after
|
203
199
|
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
-
str_after = "
|
200
|
+
str_after = "b"
|
205
201
|
correct_inline_template!(str_before)
|
206
202
|
expect(str_before).to eq str_after
|
207
203
|
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
@@ -210,18 +206,4 @@ describe "Wp2txt" do
|
|
210
206
|
expect(str_before).to eq str_after
|
211
207
|
end
|
212
208
|
end
|
213
|
-
|
214
|
-
# describe "expand_template" do
|
215
|
-
# it "gets data corresponding to a given template using mediawiki api" do
|
216
|
-
# uri = "http://en.wiktionary.org/w/api.php"
|
217
|
-
# template = "{{en-verb}}"
|
218
|
-
# word = "kick"
|
219
|
-
# expanded = expand_template(uri, template, word)
|
220
|
-
# html =<<EOD
|
221
|
-
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
222
|
-
# EOD
|
223
|
-
# html.strip!
|
224
|
-
# expanded.should == html
|
225
|
-
# end
|
226
|
-
# end
|
227
|
-
end
|
209
|
+
end
|
data/wp2txt.gemspec
CHANGED
@@ -14,6 +14,7 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.rubyforge_project = "wp2txt"
|
15
15
|
|
16
16
|
s.files = `git ls-files`.split("\n")
|
17
|
+
s.files -= ["data/*", "image/*"]
|
17
18
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
20
|
s.require_paths = ["lib"]
|
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
|
|
23
24
|
# s.add_development_dependency "rake"
|
24
25
|
|
25
26
|
s.add_dependency "nokogiri"
|
27
|
+
s.add_dependency "ruby-progressbar"
|
26
28
|
s.add_dependency "parallel"
|
27
29
|
s.add_dependency "htmlentities"
|
28
30
|
s.add_dependency "optimist"
|
31
|
+
s.add_dependency "pastel"
|
32
|
+
s.add_dependency "tty-spinner"
|
29
33
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ruby-progressbar
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: parallel
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,12 +80,39 @@ dependencies:
|
|
66
80
|
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: pastel
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tty-spinner
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
69
111
|
description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
|
70
112
|
XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
71
113
|
email:
|
72
114
|
- yohasebe@gmail.com
|
73
115
|
executables:
|
74
|
-
- benchmark.rb
|
75
116
|
- wp2txt
|
76
117
|
extensions: []
|
77
118
|
extra_rdoc_files: []
|
@@ -81,20 +122,20 @@ files:
|
|
81
122
|
- LICENSE
|
82
123
|
- README.md
|
83
124
|
- Rakefile
|
84
|
-
- bin/benchmark.rb
|
85
125
|
- bin/wp2txt
|
86
126
|
- data/output_samples/testdata_en.txt
|
87
|
-
- data/output_samples/
|
127
|
+
- data/output_samples/testdata_en_category.txt
|
88
128
|
- data/output_samples/testdata_en_summary.txt
|
89
129
|
- data/output_samples/testdata_ja.txt
|
90
|
-
- data/output_samples/
|
130
|
+
- data/output_samples/testdata_ja_category.txt
|
91
131
|
- data/output_samples/testdata_ja_summary.txt
|
92
132
|
- data/testdata_en.bz2
|
93
133
|
- data/testdata_ja.bz2
|
134
|
+
- image/screenshot.png
|
135
|
+
- image/wp2txt-logo.svg
|
136
|
+
- image/wp2txt.svg
|
94
137
|
- lib/wp2txt.rb
|
95
138
|
- lib/wp2txt/article.rb
|
96
|
-
- lib/wp2txt/mw_api.rb
|
97
|
-
- lib/wp2txt/progressbar.rb
|
98
139
|
- lib/wp2txt/utils.rb
|
99
140
|
- lib/wp2txt/version.rb
|
100
141
|
- spec/spec_helper.rb
|
@@ -118,7 +159,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
159
|
- !ruby/object:Gem::Version
|
119
160
|
version: '0'
|
120
161
|
requirements: []
|
121
|
-
rubygems_version: 3.3.
|
162
|
+
rubygems_version: 3.3.3
|
122
163
|
signing_key:
|
123
164
|
specification_version: 4
|
124
165
|
summary: Wikipedia dump to text converter
|