wp2txt 0.9.5.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +99 -58
- data/bin/wp2txt +143 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +48 -24
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +118 -148
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +4 -0
- metadata +50 -9
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/lib/wp2txt/utils.rb
CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
|
|
77
77
|
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
78
|
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
79
|
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
+
|
81
|
+
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
+
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
+
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
+
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
+
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
+
|
87
|
+
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
+
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
+
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
+
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
+
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
+
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
+
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
+
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
+
|
80
96
|
###################################################
|
81
97
|
|
82
98
|
module Wp2txt
|
@@ -104,11 +120,12 @@ module Wp2txt
|
|
104
120
|
end
|
105
121
|
|
106
122
|
def format_wiki!(text, has_retried = false)
|
123
|
+
remove_complex!(text)
|
124
|
+
|
107
125
|
escape_nowiki!(text)
|
108
126
|
process_interwiki_links!(text)
|
109
127
|
process_external_links!(text)
|
110
128
|
unescape_nowiki!(text)
|
111
|
-
|
112
129
|
remove_directive!(text)
|
113
130
|
remove_emphasis!(text)
|
114
131
|
mndash!(text)
|
@@ -120,15 +137,18 @@ module Wp2txt
|
|
120
137
|
end
|
121
138
|
|
122
139
|
def cleanup!(text)
|
123
|
-
text.gsub!(
|
124
|
-
text.gsub!(
|
125
|
-
text.gsub!(
|
126
|
-
text.gsub!(
|
127
|
-
text.gsub!(
|
128
|
-
text.gsub!(
|
140
|
+
text.gsub!($cleanup_regex_01){""}
|
141
|
+
text.gsub!($cleanup_regex_02){""}
|
142
|
+
text.gsub!($cleanup_regex_03){""}
|
143
|
+
text.gsub!($cleanup_regex_04){""}
|
144
|
+
text.gsub!($cleanup_regex_05){""}
|
145
|
+
text.gsub!($cleanup_regex_06){""}
|
146
|
+
text.gsub!($cleanup_regex_07){""}
|
147
|
+
text.gsub!($cleanup_regex_08){"\n\n"}
|
129
148
|
text.strip!
|
130
149
|
text << "\n\n"
|
131
150
|
end
|
151
|
+
|
132
152
|
#################### parser for nested structure ####################
|
133
153
|
|
134
154
|
def process_nested_structure(scanner, left, right, &block)
|
@@ -217,12 +237,16 @@ module Wp2txt
|
|
217
237
|
def process_external_links!(str)
|
218
238
|
scanner = StringScanner.new(str)
|
219
239
|
result = process_nested_structure(scanner, "[", "]") do |contents|
|
220
|
-
|
221
|
-
|
222
|
-
when 1
|
223
|
-
parts.first || ""
|
240
|
+
if /\A\s.+\s\z/ =~ contents
|
241
|
+
" (#{contents.strip}) "
|
224
242
|
else
|
225
|
-
parts
|
243
|
+
parts = contents.split(" ", 2)
|
244
|
+
case parts.size
|
245
|
+
when 1
|
246
|
+
parts.first || ""
|
247
|
+
else
|
248
|
+
parts.last || ""
|
249
|
+
end
|
226
250
|
end
|
227
251
|
end
|
228
252
|
str.replace(result)
|
@@ -239,10 +263,6 @@ module Wp2txt
|
|
239
263
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
240
264
|
""
|
241
265
|
end
|
242
|
-
scanner = StringScanner.new(result)
|
243
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
-
""
|
245
|
-
end
|
246
266
|
str.replace(result)
|
247
267
|
end
|
248
268
|
|
@@ -310,7 +330,8 @@ module Wp2txt
|
|
310
330
|
end
|
311
331
|
|
312
332
|
def remove_html!(str)
|
313
|
-
["
|
333
|
+
str.gsub!(/<[^<>]+\/>/){""}
|
334
|
+
["div", "gallery", "timeline", "noinclude"].each do |tag|
|
314
335
|
scanner = StringScanner.new(str)
|
315
336
|
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
337
|
""
|
@@ -320,11 +341,11 @@ module Wp2txt
|
|
320
341
|
end
|
321
342
|
|
322
343
|
def remove_complex!(str)
|
323
|
-
str.gsub!(
|
324
|
-
str.gsub!(
|
325
|
-
str.gsub!(
|
326
|
-
str.gsub!(
|
327
|
-
str.gsub!(
|
344
|
+
str.gsub!($complex_regex_01){"《#{$1}》"}
|
345
|
+
str.gsub!($complex_regex_02){""}
|
346
|
+
str.gsub!($complex_regex_03){""}
|
347
|
+
str.gsub!($complex_regex_04){""}
|
348
|
+
str.gsub!($complex_regex_05){""}
|
328
349
|
end
|
329
350
|
|
330
351
|
def make_reference!(str)
|
@@ -340,6 +361,8 @@ module Wp2txt
|
|
340
361
|
parts = contents.split("|")
|
341
362
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
363
|
parts.shift
|
364
|
+
elsif /\Alang\-/i =~ parts[0]
|
365
|
+
parts.shift
|
343
366
|
elsif /\Alang=/i =~ parts[1]
|
344
367
|
parts.shift
|
345
368
|
end
|
@@ -422,7 +445,7 @@ module Wp2txt
|
|
422
445
|
end
|
423
446
|
end
|
424
447
|
|
425
|
-
def rename(files)
|
448
|
+
def rename(files, ext = "txt")
|
426
449
|
# num of digits necessary to name the last file generated
|
427
450
|
maxwidth = 0
|
428
451
|
|
@@ -435,8 +458,9 @@ module Wp2txt
|
|
435
458
|
newname= f.sub(/\-(\d+)\z/) do
|
436
459
|
"-" + sprintf("%0#{maxwidth}d", $1.to_i)
|
437
460
|
end
|
438
|
-
File.rename(f, newname + "
|
461
|
+
File.rename(f, newname + ".#{ext}")
|
439
462
|
end
|
463
|
+
return true
|
440
464
|
end
|
441
465
|
|
442
466
|
# convert int of seconds to string in the format 00:00:00
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -4,14 +4,8 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
6
|
require "nokogiri"
|
7
|
-
require "parallel"
|
8
|
-
|
9
|
-
require 'etc'
|
10
|
-
require 'pp'
|
11
7
|
require "wp2txt/article"
|
12
8
|
require "wp2txt/utils"
|
13
|
-
require "wp2txt/progressbar"
|
14
|
-
# require "wp2txt/mw_api"
|
15
9
|
|
16
10
|
begin
|
17
11
|
require "bzip2-ruby"
|
@@ -22,21 +16,14 @@ rescue LoadError
|
|
22
16
|
end
|
23
17
|
|
24
18
|
module Wp2txt
|
25
|
-
class
|
26
|
-
|
19
|
+
class Splitter
|
27
20
|
include Wp2txt
|
28
|
-
|
29
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
30
|
-
@parent = parent
|
21
|
+
def initialize(input_file, output_dir = ".", tfile_size = 10)
|
31
22
|
@fp = nil
|
32
|
-
|
33
23
|
@input_file = input_file
|
34
24
|
@output_dir = output_dir
|
35
25
|
@tfile_size = tfile_size
|
36
|
-
|
37
|
-
@strip_tmarker = strip_tmarker
|
38
|
-
num_cores_available = Etc.nprocessors
|
39
|
-
@num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
|
26
|
+
prepare
|
40
27
|
end
|
41
28
|
|
42
29
|
def file_size(file)
|
@@ -62,34 +49,9 @@ module Wp2txt
|
|
62
49
|
time_elapsed = Time.now - origin
|
63
50
|
size
|
64
51
|
end
|
65
|
-
|
66
|
-
# control the display of command line progressbar (or gui which is not available for now)
|
67
|
-
def notify_parent(last = false)
|
68
|
-
@last_time ||= Time.now.to_f
|
69
|
-
@elapsed_sum ||= 0
|
70
|
-
time_now = Time.now.to_f
|
71
|
-
elapsed_from_last = (time_now - @last_time).to_i
|
72
|
-
|
73
|
-
if elapsed_from_last > 0.3 || last
|
74
52
|
|
75
|
-
|
76
|
-
@elapsed_sum += elapsed_from_last
|
77
|
-
gvalue = (@size_read.to_f / @infile_size.to_f * 100 * 100).to_i
|
78
|
-
elt_str = sec_to_str(@elapsed_sum)
|
79
|
-
if last
|
80
|
-
eta_str = "00:00:00"
|
81
|
-
else
|
82
|
-
lines_persec = @size_read / @elapsed_sum if @elapsed_sum > 0
|
83
|
-
eta_sec = (@infile_size - @size_read) / lines_persec
|
84
|
-
eta_str = sec_to_str(eta_sec)
|
85
|
-
end
|
86
|
-
@parent.prg_update(gvalue, elt_str, eta_str)
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
# check the size of input file (bz2 or plain xml) when uncompressed
|
53
|
+
# check the size of input file (bz2 or plain xml) when decompressed
|
91
54
|
def prepare
|
92
|
-
|
93
55
|
# if output_dir is not specified, output in the same directory
|
94
56
|
# as the imput file
|
95
57
|
if !@output_dir && @input_file
|
@@ -101,28 +63,12 @@ module Wp2txt
|
|
101
63
|
if /.bz2$/ =~ @input_file
|
102
64
|
unless NO_BZ2
|
103
65
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
-
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
105
|
-
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
|
-
@infile_size = file_size(file)
|
107
|
-
@parent.msg("... Done.", 1)
|
108
|
-
file.close
|
109
|
-
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
110
66
|
else
|
111
67
|
if RUBY_PLATFORM.index("win32")
|
112
68
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
113
69
|
else
|
114
70
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
115
71
|
end
|
116
|
-
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
117
|
-
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
118
|
-
@infile_size = file_size(file)
|
119
|
-
@parent.msg("... Done.", 1)
|
120
|
-
file.close # try to reopen since rewind method is unavailable
|
121
|
-
if RUBY_PLATFORM.index("win32")
|
122
|
-
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
123
|
-
else
|
124
|
-
file = IO.popen("bzip2 -c -d #{@input_file}")
|
125
|
-
end
|
126
72
|
end
|
127
73
|
else # meaning that it is a text file
|
128
74
|
@infile_size = File.stat(@input_file).size
|
@@ -137,8 +83,6 @@ module Wp2txt
|
|
137
83
|
@outfiles = []
|
138
84
|
@outfiles << outfilename
|
139
85
|
@fp = File.open(outfilename, "w")
|
140
|
-
@parent.before
|
141
|
-
@parent.data_set(@input_file, 100 * 100)
|
142
86
|
@file_pointer = file
|
143
87
|
return true
|
144
88
|
end
|
@@ -190,13 +134,113 @@ module Wp2txt
|
|
190
134
|
end
|
191
135
|
end
|
192
136
|
|
137
|
+
def split_file
|
138
|
+
output_text = ""
|
139
|
+
end_flag = false
|
140
|
+
while text = get_newline
|
141
|
+
@count ||= 0;@count += 1;
|
142
|
+
@size_read ||=0
|
143
|
+
@size_read += text.bytesize
|
144
|
+
@total_size += text.bytesize
|
145
|
+
output_text << text
|
146
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
147
|
+
# never close the file until the end of the page even if end_flag is on
|
148
|
+
if end_flag && /<\/page/ =~ text
|
149
|
+
@fp.puts(output_text)
|
150
|
+
output_text = ""
|
151
|
+
@total_size = 0
|
152
|
+
end_flag = false
|
153
|
+
@fp.close
|
154
|
+
@file_index += 1
|
155
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
156
|
+
@outfiles << outfilename
|
157
|
+
@fp = File.open(outfilename, "w")
|
158
|
+
next
|
159
|
+
end
|
160
|
+
end
|
161
|
+
@fp.puts(output_text) if output_text != ""
|
162
|
+
@fp.close
|
163
|
+
|
164
|
+
if File.size(outfilename) == 0
|
165
|
+
File.delete(outfilename)
|
166
|
+
@outfiles.delete(outfilename)
|
167
|
+
end
|
168
|
+
|
169
|
+
rename(@outfiles, "xml")
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
class Runner
|
174
|
+
include Wp2txt
|
175
|
+
|
176
|
+
def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
|
177
|
+
@fp = nil
|
178
|
+
@input_file = input_file
|
179
|
+
@output_dir = output_dir
|
180
|
+
@strip_tmarker = strip_tmarker
|
181
|
+
@del_interfile = del_interfile
|
182
|
+
prepare
|
183
|
+
end
|
184
|
+
|
185
|
+
def prepare
|
186
|
+
@infile_size = File.stat(@input_file).size
|
187
|
+
file = open(@input_file)
|
188
|
+
@file_pointer = file
|
189
|
+
@outfile_base = File.basename(@input_file, ".*")
|
190
|
+
@total_size = 0
|
191
|
+
return true
|
192
|
+
end
|
193
|
+
|
194
|
+
def fill_buffer
|
195
|
+
while true do
|
196
|
+
begin
|
197
|
+
new_lines = @file_pointer.read(10485760)
|
198
|
+
rescue => e
|
199
|
+
return nil
|
200
|
+
end
|
201
|
+
return nil unless new_lines
|
202
|
+
|
203
|
+
# temp_buf is filled with text split by "\n"
|
204
|
+
temp_buf = []
|
205
|
+
ss = StringScanner.new(new_lines)
|
206
|
+
while ss.scan(/.*?\n/m)
|
207
|
+
temp_buf << ss[0]
|
208
|
+
end
|
209
|
+
temp_buf << ss.rest unless ss.eos?
|
210
|
+
|
211
|
+
new_first_line = temp_buf.shift
|
212
|
+
if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
213
|
+
@buffer.last << new_first_line
|
214
|
+
@buffer << ""
|
215
|
+
else
|
216
|
+
@buffer.last << new_first_line
|
217
|
+
end
|
218
|
+
@buffer += temp_buf unless temp_buf.empty?
|
219
|
+
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
220
|
+
@buffer << ""
|
221
|
+
end
|
222
|
+
break if @buffer.size > 1
|
223
|
+
end
|
224
|
+
return true
|
225
|
+
end
|
226
|
+
|
227
|
+
def get_newline
|
228
|
+
@buffer ||= [""]
|
229
|
+
if @buffer.size == 1
|
230
|
+
return nil unless fill_buffer
|
231
|
+
end
|
232
|
+
if @buffer.empty?
|
233
|
+
return nil
|
234
|
+
else
|
235
|
+
new_line = @buffer.shift
|
236
|
+
return new_line
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
193
240
|
def get_page
|
194
241
|
inside_page = false
|
195
242
|
page = ""
|
196
243
|
while line = get_newline
|
197
|
-
notify_parent
|
198
|
-
@size_read ||=0; @size_read += line.bytesize
|
199
|
-
|
200
244
|
if /<page>/ =~ line #
|
201
245
|
page << line
|
202
246
|
inside_page = true
|
@@ -215,22 +259,7 @@ module Wp2txt
|
|
215
259
|
end
|
216
260
|
end
|
217
261
|
|
218
|
-
# call this method to do the job
|
219
262
|
def extract_text(&block)
|
220
|
-
prepare
|
221
|
-
if @convert
|
222
|
-
if block
|
223
|
-
extract_and_convert(&block)
|
224
|
-
else
|
225
|
-
extract_and_convert
|
226
|
-
end
|
227
|
-
else
|
228
|
-
# output the original xml only split to files of the specified size
|
229
|
-
extract
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
def extract_and_convert(&block)
|
234
263
|
in_text = false
|
235
264
|
in_message = false
|
236
265
|
result_text = ""
|
@@ -241,17 +270,15 @@ module Wp2txt
|
|
241
270
|
pages = []
|
242
271
|
data_empty = false
|
243
272
|
|
244
|
-
|
273
|
+
while !data_empty
|
245
274
|
page = get_page
|
246
275
|
if page
|
247
276
|
pages << page
|
248
277
|
else
|
249
278
|
data_empty = true
|
250
279
|
end
|
251
|
-
if data_empty
|
252
|
-
|
253
|
-
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
254
|
-
page_text = {:order => n, :data => nil}
|
280
|
+
if data_empty
|
281
|
+
pages.each do |page|
|
255
282
|
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
256
283
|
xml = xmlns + page + "</mediawiki>"
|
257
284
|
|
@@ -270,79 +297,22 @@ module Wp2txt
|
|
270
297
|
end
|
271
298
|
end
|
272
299
|
article = Article.new(text, title, @strip_tmarker)
|
273
|
-
page_text
|
300
|
+
page_text = block.call(article)
|
301
|
+
output_text << page_text
|
274
302
|
end
|
275
|
-
page_text
|
276
|
-
end
|
277
|
-
pages.clear
|
278
|
-
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
279
|
-
pages_text.each do |page_text|
|
280
|
-
output_text << page_text
|
281
|
-
@count ||= 0; @count += 1;
|
282
|
-
@total_size = output_text.bytesize
|
283
|
-
# flagged when data exceeds the size of output file
|
284
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
285
303
|
end
|
286
304
|
|
287
|
-
|
288
|
-
if
|
289
|
-
|
305
|
+
cleanup!(output_text)
|
306
|
+
if output_text.size > 0
|
307
|
+
outfilename = File.join(@output_dir, @outfile_base + ".txt")
|
308
|
+
@fp = File.open(outfilename, "w")
|
290
309
|
@fp.puts(output_text)
|
291
|
-
output_text = ""
|
292
|
-
@total_size = 0
|
293
|
-
end_flag = false
|
294
310
|
@fp.close
|
295
|
-
@file_index += 1
|
296
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
297
|
-
@outfiles << outfilename
|
298
|
-
@fp = File.open(outfilename, "w")
|
299
|
-
next
|
300
311
|
end
|
301
|
-
|
302
|
-
end while !data_empty
|
303
|
-
|
304
|
-
if output_text != ""
|
305
|
-
cleanup!(output_text)
|
306
|
-
@fp.puts(output_text)
|
307
|
-
end
|
308
|
-
notify_parent(true)
|
309
|
-
@parent.after
|
310
|
-
@fp.close
|
311
|
-
rename(@outfiles)
|
312
|
-
@parent.msg("Processing finished", 1)
|
313
|
-
end
|
314
|
-
|
315
|
-
def extract
|
316
|
-
output_text = ""
|
317
|
-
end_flag = false
|
318
|
-
while text = get_newline
|
319
|
-
@count ||= 0;@count += 1;
|
320
|
-
@size_read ||=0;@size_read += text.bytesize
|
321
|
-
@total_size += text.bytesize
|
322
|
-
output_text << text
|
323
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
324
|
-
notify_parent
|
325
|
-
# never close the file until the end of the page even if end_flag is on
|
326
|
-
if end_flag && /<\/page/ =~ text
|
327
|
-
@fp.puts(output_text)
|
312
|
+
File.delete(@input_file) if @del_interfile
|
328
313
|
output_text = ""
|
329
|
-
@total_size = 0
|
330
|
-
end_flag = false
|
331
|
-
@fp.close
|
332
|
-
@file_index += 1
|
333
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
334
|
-
@outfiles << outfilename
|
335
|
-
@fp = File.open(outfilename, "w")
|
336
|
-
next
|
337
314
|
end
|
338
315
|
end
|
339
|
-
|
340
|
-
notify_parent(true)
|
341
|
-
@parent.after
|
342
|
-
@fp.close
|
343
|
-
rename(@outfiles)
|
344
|
-
@parent.msg("Processing finished", 1)
|
345
|
-
end
|
316
|
+
end
|
346
317
|
end
|
347
318
|
end
|
348
|
-
|
data/spec/utils_spec.rb
CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
|
|
184
184
|
|
185
185
|
describe "correct_inline_template!" do
|
186
186
|
it "removes brackets and leaving some text" do
|
187
|
-
# str_before = "{{}}"
|
188
|
-
# str_after = ""
|
189
|
-
# correct_inline_template!(str_before)
|
190
|
-
# expect(str_before).to eq str_after
|
191
187
|
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
188
|
str_after = "JPN"
|
193
189
|
correct_inline_template!(str_before)
|
@@ -197,11 +193,11 @@ describe "Wp2txt" do
|
|
197
193
|
correct_inline_template!(str_before)
|
198
194
|
expect(str_before).to eq str_after
|
199
195
|
str_before = "{{a|b=c|d=f}}"
|
200
|
-
str_after = "
|
196
|
+
str_after = "c"
|
201
197
|
correct_inline_template!(str_before)
|
202
198
|
expect(str_before).to eq str_after
|
203
199
|
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
-
str_after = "
|
200
|
+
str_after = "b"
|
205
201
|
correct_inline_template!(str_before)
|
206
202
|
expect(str_before).to eq str_after
|
207
203
|
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
@@ -210,18 +206,4 @@ describe "Wp2txt" do
|
|
210
206
|
expect(str_before).to eq str_after
|
211
207
|
end
|
212
208
|
end
|
213
|
-
|
214
|
-
# describe "expand_template" do
|
215
|
-
# it "gets data corresponding to a given template using mediawiki api" do
|
216
|
-
# uri = "http://en.wiktionary.org/w/api.php"
|
217
|
-
# template = "{{en-verb}}"
|
218
|
-
# word = "kick"
|
219
|
-
# expanded = expand_template(uri, template, word)
|
220
|
-
# html =<<EOD
|
221
|
-
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
222
|
-
# EOD
|
223
|
-
# html.strip!
|
224
|
-
# expanded.should == html
|
225
|
-
# end
|
226
|
-
# end
|
227
|
-
end
|
209
|
+
end
|
data/wp2txt.gemspec
CHANGED
@@ -14,6 +14,7 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.rubyforge_project = "wp2txt"
|
15
15
|
|
16
16
|
s.files = `git ls-files`.split("\n")
|
17
|
+
s.files -= ["data/*", "image/*"]
|
17
18
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
20
|
s.require_paths = ["lib"]
|
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
|
|
23
24
|
# s.add_development_dependency "rake"
|
24
25
|
|
25
26
|
s.add_dependency "nokogiri"
|
27
|
+
s.add_dependency "ruby-progressbar"
|
26
28
|
s.add_dependency "parallel"
|
27
29
|
s.add_dependency "htmlentities"
|
28
30
|
s.add_dependency "optimist"
|
31
|
+
s.add_dependency "pastel"
|
32
|
+
s.add_dependency "tty-spinner"
|
29
33
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ruby-progressbar
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: parallel
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,12 +80,39 @@ dependencies:
|
|
66
80
|
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: pastel
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tty-spinner
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
69
111
|
description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
|
70
112
|
XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
71
113
|
email:
|
72
114
|
- yohasebe@gmail.com
|
73
115
|
executables:
|
74
|
-
- benchmark.rb
|
75
116
|
- wp2txt
|
76
117
|
extensions: []
|
77
118
|
extra_rdoc_files: []
|
@@ -81,20 +122,20 @@ files:
|
|
81
122
|
- LICENSE
|
82
123
|
- README.md
|
83
124
|
- Rakefile
|
84
|
-
- bin/benchmark.rb
|
85
125
|
- bin/wp2txt
|
86
126
|
- data/output_samples/testdata_en.txt
|
87
|
-
- data/output_samples/
|
127
|
+
- data/output_samples/testdata_en_category.txt
|
88
128
|
- data/output_samples/testdata_en_summary.txt
|
89
129
|
- data/output_samples/testdata_ja.txt
|
90
|
-
- data/output_samples/
|
130
|
+
- data/output_samples/testdata_ja_category.txt
|
91
131
|
- data/output_samples/testdata_ja_summary.txt
|
92
132
|
- data/testdata_en.bz2
|
93
133
|
- data/testdata_ja.bz2
|
134
|
+
- image/screenshot.png
|
135
|
+
- image/wp2txt-logo.svg
|
136
|
+
- image/wp2txt.svg
|
94
137
|
- lib/wp2txt.rb
|
95
138
|
- lib/wp2txt/article.rb
|
96
|
-
- lib/wp2txt/mw_api.rb
|
97
|
-
- lib/wp2txt/progressbar.rb
|
98
139
|
- lib/wp2txt/utils.rb
|
99
140
|
- lib/wp2txt/version.rb
|
100
141
|
- spec/spec_helper.rb
|
@@ -118,7 +159,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
159
|
- !ruby/object:Gem::Version
|
119
160
|
version: '0'
|
120
161
|
requirements: []
|
121
|
-
rubygems_version: 3.3.
|
162
|
+
rubygems_version: 3.3.3
|
122
163
|
signing_key:
|
123
164
|
specification_version: 4
|
124
165
|
summary: Wikipedia dump to text converter
|