wp2txt 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +26 -3
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/lib/wp2txt.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
1
|
+
# frozen_string_literal: true
|
5
2
|
|
6
3
|
require "nokogiri"
|
7
|
-
|
8
|
-
|
4
|
+
require_relative "wp2txt/article"
|
5
|
+
require_relative "wp2txt/utils"
|
9
6
|
|
10
7
|
module Wp2txt
|
11
8
|
class Splitter
|
@@ -15,51 +12,51 @@ module Wp2txt
|
|
15
12
|
@input_file = input_file
|
16
13
|
@output_dir = output_dir
|
17
14
|
@tfile_size = tfile_size
|
18
|
-
if bz2_gem
|
19
|
-
require "bzip2-ruby"
|
20
|
-
end
|
15
|
+
require "bzip2-ruby" if bz2_gem
|
21
16
|
@bz2_gem = bz2_gem
|
22
17
|
prepare
|
23
18
|
end
|
24
19
|
|
25
20
|
def file_size(file)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
21
|
+
size = 0
|
22
|
+
unit = 10_485_760
|
23
|
+
star = 0
|
24
|
+
before = Time.now.to_f
|
25
|
+
|
26
|
+
loop do
|
30
27
|
begin
|
31
28
|
a = file.read(unit)
|
32
|
-
rescue
|
29
|
+
rescue StandardError
|
33
30
|
a = nil
|
34
31
|
end
|
35
32
|
break unless a
|
36
33
|
|
37
34
|
present = Time.now.to_f
|
38
35
|
size += a.size
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
36
|
+
|
37
|
+
next if present - before <= 0.3
|
38
|
+
|
39
|
+
star = 0 if star > 10
|
40
|
+
star += 1
|
41
|
+
before = present
|
44
42
|
end
|
45
|
-
time_elapsed = Time.now - origin
|
46
43
|
size
|
47
44
|
end
|
48
45
|
|
49
46
|
# check if a given command exists: return the path if it does, return false if not
|
50
47
|
def command_exist?(command)
|
51
48
|
basename = File.basename(command)
|
52
|
-
path = ""
|
49
|
+
path = +""
|
53
50
|
print "Checking #{basename}: "
|
54
|
-
if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
|
51
|
+
if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
|
55
52
|
puts "detected [#{path}]"
|
56
|
-
|
57
|
-
elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
|
53
|
+
path.strip
|
54
|
+
elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
|
58
55
|
puts "detected [#{path}]"
|
59
|
-
|
56
|
+
path.strip
|
60
57
|
else
|
61
58
|
puts "not found"
|
62
|
-
|
59
|
+
false
|
63
60
|
end
|
64
61
|
end
|
65
62
|
|
@@ -67,28 +64,22 @@ module Wp2txt
|
|
67
64
|
def prepare
|
68
65
|
# if output_dir is not specified, output in the same directory
|
69
66
|
# as the input file
|
70
|
-
if !@output_dir && @input_file
|
71
|
-
@output_dir = File.dirname(@input_file)
|
72
|
-
end
|
67
|
+
@output_dir = File.dirname(@input_file) if !@output_dir && @input_file
|
73
68
|
|
74
69
|
if /.bz2$/ =~ @input_file
|
75
70
|
if @bz2_gem
|
76
71
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
77
72
|
elsif RUBY_PLATFORM.index("win32")
|
78
73
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
79
|
-
|
80
|
-
|
81
|
-
command_exist?("pbzip2") ||
|
82
|
-
command_exist?("bzip2")
|
83
|
-
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
84
|
-
end
|
74
|
+
elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
|
75
|
+
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
85
76
|
end
|
86
77
|
else # meaning that it is a text file
|
87
78
|
@infile_size = File.stat(@input_file).size
|
88
79
|
file = open(@input_file)
|
89
80
|
end
|
90
81
|
|
91
|
-
#create basename of output file
|
82
|
+
# create basename of output file
|
92
83
|
@outfile_base = File.basename(@input_file, ".*") + "-"
|
93
84
|
@total_size = 0
|
94
85
|
@file_index = 1
|
@@ -97,15 +88,15 @@ module Wp2txt
|
|
97
88
|
@outfiles << outfilename
|
98
89
|
@fp = File.open(outfilename, "w")
|
99
90
|
@file_pointer = file
|
100
|
-
|
91
|
+
true
|
101
92
|
end
|
102
93
|
|
103
94
|
# read text data from the input file (bz2-compressed or plain text) in 10 MiB chunks
|
104
95
|
def fill_buffer
|
105
|
-
|
96
|
+
loop do
|
106
97
|
begin
|
107
|
-
new_lines = @file_pointer.read(
|
108
|
-
rescue
|
98
|
+
new_lines = @file_pointer.read(10_485_760)
|
99
|
+
rescue StandardError
|
109
100
|
return nil
|
110
101
|
end
|
111
102
|
return nil unless new_lines
|
@@ -113,68 +104,58 @@ module Wp2txt
|
|
113
104
|
# temp_buf is filled with text split by "\n"
|
114
105
|
temp_buf = []
|
115
106
|
ss = StringScanner.new(new_lines)
|
116
|
-
while ss.scan(/.*?\n/m)
|
117
|
-
temp_buf << ss[0]
|
118
|
-
end
|
107
|
+
temp_buf << ss[0] while ss.scan(/.*?\n/m)
|
119
108
|
temp_buf << ss.rest unless ss.eos?
|
120
109
|
|
121
110
|
new_first_line = temp_buf.shift
|
122
|
-
|
123
|
-
|
124
|
-
@buffer << ""
|
125
|
-
else
|
126
|
-
@buffer.last << new_first_line
|
127
|
-
end
|
111
|
+
@buffer.last << new_first_line
|
112
|
+
@buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
128
113
|
@buffer += temp_buf unless temp_buf.empty?
|
129
|
-
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
130
|
-
@buffer << ""
|
131
|
-
end
|
114
|
+
@buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
132
115
|
break if @buffer.size > 1
|
133
116
|
end
|
134
|
-
|
117
|
+
true
|
135
118
|
end
|
136
119
|
|
137
120
|
def get_newline
|
138
|
-
@buffer ||= [""]
|
139
|
-
if @buffer.size == 1
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
return nil
|
121
|
+
@buffer ||= [+""]
|
122
|
+
if @buffer.size == 1 && !fill_buffer
|
123
|
+
nil
|
124
|
+
elsif @buffer.empty?
|
125
|
+
nil
|
144
126
|
else
|
145
|
-
|
146
|
-
return new_line
|
127
|
+
@buffer.shift
|
147
128
|
end
|
148
129
|
end
|
149
130
|
|
150
131
|
def split_file
|
151
|
-
output_text = ""
|
132
|
+
output_text = +""
|
152
133
|
end_flag = false
|
153
|
-
while text = get_newline
|
154
|
-
@count ||= 0
|
155
|
-
@
|
134
|
+
while (text = get_newline)
|
135
|
+
@count ||= 0
|
136
|
+
@count += 1
|
137
|
+
@size_read ||= 0
|
156
138
|
@size_read += text.bytesize
|
157
139
|
@total_size += text.bytesize
|
158
140
|
output_text << text
|
159
141
|
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
160
142
|
# never close the file until the end of the page even if end_flag is on
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
end
|
143
|
+
next unless end_flag && %r{</page} =~ text
|
144
|
+
|
145
|
+
@fp.puts(output_text)
|
146
|
+
output_text = +""
|
147
|
+
@total_size = 0
|
148
|
+
end_flag = false
|
149
|
+
@fp.close
|
150
|
+
@file_index += 1
|
151
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
152
|
+
@outfiles << outfilename
|
153
|
+
@fp = File.open(outfilename, "w")
|
173
154
|
end
|
174
155
|
@fp.puts(output_text) if output_text != ""
|
175
156
|
@fp.close
|
176
157
|
|
177
|
-
if File.size(outfilename)
|
158
|
+
if File.size(outfilename).zero?
|
178
159
|
File.delete(outfilename)
|
179
160
|
@outfiles.delete(outfilename)
|
180
161
|
end
|
@@ -201,14 +182,14 @@ module Wp2txt
|
|
201
182
|
@file_pointer = file
|
202
183
|
@outfile_base = File.basename(@input_file, ".*")
|
203
184
|
@total_size = 0
|
204
|
-
|
185
|
+
true
|
205
186
|
end
|
206
187
|
|
207
188
|
def fill_buffer
|
208
|
-
|
189
|
+
loop do
|
209
190
|
begin
|
210
|
-
new_lines = @file_pointer.read(
|
211
|
-
rescue
|
191
|
+
new_lines = @file_pointer.read(10_485_760)
|
192
|
+
rescue StandardError
|
212
193
|
return nil
|
213
194
|
end
|
214
195
|
return nil unless new_lines
|
@@ -216,49 +197,40 @@ module Wp2txt
|
|
216
197
|
# temp_buf is filled with text split by "\n"
|
217
198
|
temp_buf = []
|
218
199
|
ss = StringScanner.new(new_lines)
|
219
|
-
while ss.scan(/.*?\n/m)
|
220
|
-
temp_buf << ss[0]
|
221
|
-
end
|
200
|
+
temp_buf << ss[0] while ss.scan(/.*?\n/m)
|
222
201
|
temp_buf << ss.rest unless ss.eos?
|
223
202
|
|
224
203
|
new_first_line = temp_buf.shift
|
225
|
-
|
226
|
-
|
227
|
-
@buffer << ""
|
228
|
-
else
|
229
|
-
@buffer.last << new_first_line
|
230
|
-
end
|
204
|
+
@buffer.last << new_first_line
|
205
|
+
@buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
231
206
|
@buffer += temp_buf unless temp_buf.empty?
|
232
|
-
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
233
|
-
@buffer << ""
|
234
|
-
end
|
207
|
+
@buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
235
208
|
break if @buffer.size > 1
|
236
209
|
end
|
237
|
-
|
210
|
+
true
|
238
211
|
end
|
239
212
|
|
240
213
|
def get_newline
|
241
|
-
@buffer ||= [""]
|
242
|
-
if @buffer.size == 1
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
return nil
|
214
|
+
@buffer ||= [+""]
|
215
|
+
if @buffer.size == 1 && !fill_buffer
|
216
|
+
nil
|
217
|
+
elsif @buffer.empty?
|
218
|
+
nil
|
247
219
|
else
|
248
|
-
|
249
|
-
return new_line
|
220
|
+
@buffer.shift
|
250
221
|
end
|
251
222
|
end
|
252
223
|
|
253
224
|
def get_page
|
254
225
|
inside_page = false
|
255
|
-
page = ""
|
256
|
-
while line = get_newline
|
257
|
-
|
226
|
+
page = +""
|
227
|
+
while (line = get_newline)
|
228
|
+
case line
|
229
|
+
when /<page>/
|
258
230
|
page << line
|
259
231
|
inside_page = true
|
260
232
|
next
|
261
|
-
|
233
|
+
when %r{</page>}
|
262
234
|
page << line
|
263
235
|
inside_page = false
|
264
236
|
break
|
@@ -266,65 +238,62 @@ module Wp2txt
|
|
266
238
|
page << line if inside_page
|
267
239
|
end
|
268
240
|
if page.empty?
|
269
|
-
|
241
|
+
false
|
270
242
|
else
|
271
|
-
|
243
|
+
page.force_encoding("utf-8")
|
272
244
|
end
|
245
|
+
rescue StandardError
|
246
|
+
page
|
273
247
|
end
|
274
248
|
|
275
249
|
def extract_text(&block)
|
276
|
-
in_text = false
|
277
|
-
in_message = false
|
278
|
-
result_text = ""
|
279
250
|
title = nil
|
280
|
-
|
281
|
-
terminal_round = false
|
282
|
-
output_text = ""
|
251
|
+
output_text = +""
|
283
252
|
pages = []
|
284
253
|
data_empty = false
|
285
254
|
|
286
|
-
|
287
|
-
|
288
|
-
if
|
289
|
-
pages <<
|
255
|
+
until data_empty
|
256
|
+
new_page = get_page
|
257
|
+
if new_page
|
258
|
+
pages << new_page
|
290
259
|
else
|
291
260
|
data_empty = true
|
292
261
|
end
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
article = Article.new(text, title, @strip_tmarker)
|
313
|
-
page_text = block.call(article)
|
314
|
-
output_text << page_text
|
262
|
+
next unless data_empty
|
263
|
+
|
264
|
+
pages.each do |page|
|
265
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
266
|
+
xml = xmlns + page + "</mediawiki>"
|
267
|
+
|
268
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
269
|
+
page = input.xpath("//xmlns:text").first
|
270
|
+
pp_title = page.parent.parent.at_css "title"
|
271
|
+
title = pp_title.content
|
272
|
+
next if /:/ =~ title
|
273
|
+
|
274
|
+
text = page.content
|
275
|
+
text.gsub!(/<!--(.*?)-->/m) do |content|
|
276
|
+
num_of_newlines = content.count("\n")
|
277
|
+
if num_of_newlines.zero?
|
278
|
+
+""
|
279
|
+
else
|
280
|
+
"\n" * num_of_newlines
|
315
281
|
end
|
316
282
|
end
|
283
|
+
article = Article.new(text, title, @strip_tmarker)
|
284
|
+
page_text = block.call(article)
|
285
|
+
output_text << page_text
|
286
|
+
end
|
317
287
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
end
|
325
|
-
File.delete(@input_file) if @del_interfile
|
326
|
-
output_text = ""
|
288
|
+
output_text = cleanup(output_text)
|
289
|
+
unless output_text.empty?
|
290
|
+
outfilename = File.join(@output_dir, @outfile_base + ".txt")
|
291
|
+
@fp = File.open(outfilename, "w")
|
292
|
+
@fp.puts(output_text)
|
293
|
+
@fp.close
|
327
294
|
end
|
295
|
+
File.delete(@input_file) if @del_interfile
|
296
|
+
output_text = +""
|
328
297
|
end
|
329
298
|
end
|
330
299
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "rspec"
|
3
4
|
|
4
5
|
RSpec.configure do |config|
|
5
|
-
|
6
|
-
end
|
6
|
+
end
|