wp2txt 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +42 -13
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +172 -282
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -11
data/lib/wp2txt.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
1
|
+
# frozen_string_literal: true
|
5
2
|
|
6
3
|
require "nokogiri"
|
7
|
-
|
8
|
-
|
4
|
+
require_relative "wp2txt/article"
|
5
|
+
require_relative "wp2txt/utils"
|
9
6
|
|
10
7
|
module Wp2txt
|
11
8
|
class Splitter
|
@@ -15,51 +12,51 @@ module Wp2txt
|
|
15
12
|
@input_file = input_file
|
16
13
|
@output_dir = output_dir
|
17
14
|
@tfile_size = tfile_size
|
18
|
-
if bz2_gem
|
19
|
-
require "bzip2-ruby"
|
20
|
-
end
|
15
|
+
require "bzip2-ruby" if bz2_gem
|
21
16
|
@bz2_gem = bz2_gem
|
22
17
|
prepare
|
23
18
|
end
|
24
19
|
|
25
20
|
def file_size(file)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
21
|
+
size = 0
|
22
|
+
unit = 10_485_760
|
23
|
+
star = 0
|
24
|
+
before = Time.now.to_f
|
25
|
+
|
26
|
+
loop do
|
30
27
|
begin
|
31
28
|
a = file.read(unit)
|
32
|
-
rescue
|
29
|
+
rescue StandardError
|
33
30
|
a = nil
|
34
31
|
end
|
35
32
|
break unless a
|
36
33
|
|
37
34
|
present = Time.now.to_f
|
38
35
|
size += a.size
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
36
|
+
|
37
|
+
next if present - before <= 0.3
|
38
|
+
|
39
|
+
star = 0 if star > 10
|
40
|
+
star += 1
|
41
|
+
before = present
|
44
42
|
end
|
45
|
-
time_elapsed = Time.now - origin
|
46
43
|
size
|
47
44
|
end
|
48
45
|
|
49
46
|
# check if a given command exists: return the path if it does, return false if not
|
50
47
|
def command_exist?(command)
|
51
48
|
basename = File.basename(command)
|
52
|
-
path = ""
|
49
|
+
path = +""
|
53
50
|
print "Checking #{basename}: "
|
54
|
-
if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
|
51
|
+
if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
|
55
52
|
puts "detected [#{path}]"
|
56
|
-
|
57
|
-
elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
|
53
|
+
path.strip
|
54
|
+
elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
|
58
55
|
puts "detected [#{path}]"
|
59
|
-
|
56
|
+
path.strip
|
60
57
|
else
|
61
58
|
puts "not found"
|
62
|
-
|
59
|
+
false
|
63
60
|
end
|
64
61
|
end
|
65
62
|
|
@@ -67,28 +64,22 @@ module Wp2txt
|
|
67
64
|
def prepare
|
68
65
|
# if output_dir is not specified, output in the same directory
|
69
66
|
# as the input file
|
70
|
-
if !@output_dir && @input_file
|
71
|
-
@output_dir = File.dirname(@input_file)
|
72
|
-
end
|
67
|
+
@output_dir = File.dirname(@input_file) if !@output_dir && @input_file
|
73
68
|
|
74
69
|
if /.bz2$/ =~ @input_file
|
75
70
|
if @bz2_gem
|
76
71
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
77
72
|
elsif RUBY_PLATFORM.index("win32")
|
78
73
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
79
|
-
|
80
|
-
|
81
|
-
command_exist?("pbzip2") ||
|
82
|
-
command_exist?("bzip2")
|
83
|
-
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
84
|
-
end
|
74
|
+
elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
|
75
|
+
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
85
76
|
end
|
86
77
|
else # meaning that it is a text file
|
87
78
|
@infile_size = File.stat(@input_file).size
|
88
79
|
file = open(@input_file)
|
89
80
|
end
|
90
81
|
|
91
|
-
#create basename of output file
|
82
|
+
# create basename of output file
|
92
83
|
@outfile_base = File.basename(@input_file, ".*") + "-"
|
93
84
|
@total_size = 0
|
94
85
|
@file_index = 1
|
@@ -97,15 +88,15 @@ module Wp2txt
|
|
97
88
|
@outfiles << outfilename
|
98
89
|
@fp = File.open(outfilename, "w")
|
99
90
|
@file_pointer = file
|
100
|
-
|
91
|
+
true
|
101
92
|
end
|
102
93
|
|
103
94
|
# read text data from the (possibly bz2-compressed) input file, 10 megabytes at a time
|
104
95
|
def fill_buffer
|
105
|
-
|
96
|
+
loop do
|
106
97
|
begin
|
107
|
-
new_lines = @file_pointer.read(
|
108
|
-
rescue
|
98
|
+
new_lines = @file_pointer.read(10_485_760)
|
99
|
+
rescue StandardError
|
109
100
|
return nil
|
110
101
|
end
|
111
102
|
return nil unless new_lines
|
@@ -113,68 +104,58 @@ module Wp2txt
|
|
113
104
|
# temp_buf is filled with text split by "\n"
|
114
105
|
temp_buf = []
|
115
106
|
ss = StringScanner.new(new_lines)
|
116
|
-
while ss.scan(/.*?\n/m)
|
117
|
-
temp_buf << ss[0]
|
118
|
-
end
|
107
|
+
temp_buf << ss[0] while ss.scan(/.*?\n/m)
|
119
108
|
temp_buf << ss.rest unless ss.eos?
|
120
109
|
|
121
110
|
new_first_line = temp_buf.shift
|
122
|
-
|
123
|
-
|
124
|
-
@buffer << ""
|
125
|
-
else
|
126
|
-
@buffer.last << new_first_line
|
127
|
-
end
|
111
|
+
@buffer.last << new_first_line
|
112
|
+
@buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
128
113
|
@buffer += temp_buf unless temp_buf.empty?
|
129
|
-
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
130
|
-
@buffer << ""
|
131
|
-
end
|
114
|
+
@buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
132
115
|
break if @buffer.size > 1
|
133
116
|
end
|
134
|
-
|
117
|
+
true
|
135
118
|
end
|
136
119
|
|
137
120
|
def get_newline
|
138
|
-
@buffer ||= [""]
|
139
|
-
if @buffer.size == 1
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
return nil
|
121
|
+
@buffer ||= [+""]
|
122
|
+
if @buffer.size == 1 && !fill_buffer
|
123
|
+
nil
|
124
|
+
elsif @buffer.empty?
|
125
|
+
nil
|
144
126
|
else
|
145
|
-
|
146
|
-
return new_line
|
127
|
+
@buffer.shift
|
147
128
|
end
|
148
129
|
end
|
149
130
|
|
150
131
|
def split_file
|
151
|
-
output_text = ""
|
132
|
+
output_text = +""
|
152
133
|
end_flag = false
|
153
|
-
while text = get_newline
|
154
|
-
@count ||= 0
|
155
|
-
@
|
134
|
+
while (text = get_newline)
|
135
|
+
@count ||= 0
|
136
|
+
@count += 1
|
137
|
+
@size_read ||= 0
|
156
138
|
@size_read += text.bytesize
|
157
139
|
@total_size += text.bytesize
|
158
140
|
output_text << text
|
159
141
|
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
160
142
|
# never close the file until the end of the page even if end_flag is on
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
end
|
143
|
+
next unless end_flag && %r{</page} =~ text
|
144
|
+
|
145
|
+
@fp.puts(output_text)
|
146
|
+
output_text = +""
|
147
|
+
@total_size = 0
|
148
|
+
end_flag = false
|
149
|
+
@fp.close
|
150
|
+
@file_index += 1
|
151
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
152
|
+
@outfiles << outfilename
|
153
|
+
@fp = File.open(outfilename, "w")
|
173
154
|
end
|
174
155
|
@fp.puts(output_text) if output_text != ""
|
175
156
|
@fp.close
|
176
157
|
|
177
|
-
if File.size(outfilename)
|
158
|
+
if File.size(outfilename).zero?
|
178
159
|
File.delete(outfilename)
|
179
160
|
@outfiles.delete(outfilename)
|
180
161
|
end
|
@@ -201,14 +182,14 @@ module Wp2txt
|
|
201
182
|
@file_pointer = file
|
202
183
|
@outfile_base = File.basename(@input_file, ".*")
|
203
184
|
@total_size = 0
|
204
|
-
|
185
|
+
true
|
205
186
|
end
|
206
187
|
|
207
188
|
def fill_buffer
|
208
|
-
|
189
|
+
loop do
|
209
190
|
begin
|
210
|
-
new_lines = @file_pointer.read(
|
211
|
-
rescue
|
191
|
+
new_lines = @file_pointer.read(10_485_760)
|
192
|
+
rescue StandardError
|
212
193
|
return nil
|
213
194
|
end
|
214
195
|
return nil unless new_lines
|
@@ -216,49 +197,40 @@ module Wp2txt
|
|
216
197
|
# temp_buf is filled with text split by "\n"
|
217
198
|
temp_buf = []
|
218
199
|
ss = StringScanner.new(new_lines)
|
219
|
-
while ss.scan(/.*?\n/m)
|
220
|
-
temp_buf << ss[0]
|
221
|
-
end
|
200
|
+
temp_buf << ss[0] while ss.scan(/.*?\n/m)
|
222
201
|
temp_buf << ss.rest unless ss.eos?
|
223
202
|
|
224
203
|
new_first_line = temp_buf.shift
|
225
|
-
|
226
|
-
|
227
|
-
@buffer << ""
|
228
|
-
else
|
229
|
-
@buffer.last << new_first_line
|
230
|
-
end
|
204
|
+
@buffer.last << new_first_line
|
205
|
+
@buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
231
206
|
@buffer += temp_buf unless temp_buf.empty?
|
232
|
-
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
233
|
-
@buffer << ""
|
234
|
-
end
|
207
|
+
@buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
235
208
|
break if @buffer.size > 1
|
236
209
|
end
|
237
|
-
|
210
|
+
true
|
238
211
|
end
|
239
212
|
|
240
213
|
def get_newline
|
241
|
-
@buffer ||= [""]
|
242
|
-
if @buffer.size == 1
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
return nil
|
214
|
+
@buffer ||= [+""]
|
215
|
+
if @buffer.size == 1 && !fill_buffer
|
216
|
+
nil
|
217
|
+
elsif @buffer.empty?
|
218
|
+
nil
|
247
219
|
else
|
248
|
-
|
249
|
-
return new_line
|
220
|
+
@buffer.shift
|
250
221
|
end
|
251
222
|
end
|
252
223
|
|
253
224
|
def get_page
|
254
225
|
inside_page = false
|
255
|
-
page = ""
|
256
|
-
while line = get_newline
|
257
|
-
|
226
|
+
page = +""
|
227
|
+
while (line = get_newline)
|
228
|
+
case line
|
229
|
+
when /<page>/
|
258
230
|
page << line
|
259
231
|
inside_page = true
|
260
232
|
next
|
261
|
-
|
233
|
+
when %r{</page>}
|
262
234
|
page << line
|
263
235
|
inside_page = false
|
264
236
|
break
|
@@ -266,65 +238,62 @@ module Wp2txt
|
|
266
238
|
page << line if inside_page
|
267
239
|
end
|
268
240
|
if page.empty?
|
269
|
-
|
241
|
+
false
|
270
242
|
else
|
271
|
-
|
243
|
+
page.force_encoding("utf-8")
|
272
244
|
end
|
245
|
+
rescue StandardError
|
246
|
+
page
|
273
247
|
end
|
274
248
|
|
275
249
|
def extract_text(&block)
|
276
|
-
in_text = false
|
277
|
-
in_message = false
|
278
|
-
result_text = ""
|
279
250
|
title = nil
|
280
|
-
|
281
|
-
terminal_round = false
|
282
|
-
output_text = ""
|
251
|
+
output_text = +""
|
283
252
|
pages = []
|
284
253
|
data_empty = false
|
285
254
|
|
286
|
-
|
287
|
-
|
288
|
-
if
|
289
|
-
pages <<
|
255
|
+
until data_empty
|
256
|
+
new_page = get_page
|
257
|
+
if new_page
|
258
|
+
pages << new_page
|
290
259
|
else
|
291
260
|
data_empty = true
|
292
261
|
end
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
article = Article.new(text, title, @strip_tmarker)
|
313
|
-
page_text = block.call(article)
|
314
|
-
output_text << page_text
|
262
|
+
next unless data_empty
|
263
|
+
|
264
|
+
pages.each do |page|
|
265
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
266
|
+
xml = xmlns + page + "</mediawiki>"
|
267
|
+
|
268
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
269
|
+
page = input.xpath("//xmlns:text").first
|
270
|
+
pp_title = page.parent.parent.at_css "title"
|
271
|
+
title = pp_title.content
|
272
|
+
next if /:/ =~ title
|
273
|
+
|
274
|
+
text = page.content
|
275
|
+
text.gsub!(/<!--(.*?)-->/m) do |content|
|
276
|
+
num_of_newlines = content.count("\n")
|
277
|
+
if num_of_newlines.zero?
|
278
|
+
+""
|
279
|
+
else
|
280
|
+
"\n" * num_of_newlines
|
315
281
|
end
|
316
282
|
end
|
283
|
+
article = Article.new(text, title, @strip_tmarker)
|
284
|
+
page_text = block.call(article)
|
285
|
+
output_text << page_text
|
286
|
+
end
|
317
287
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
end
|
325
|
-
File.delete(@input_file) if @del_interfile
|
326
|
-
output_text = ""
|
288
|
+
output_text = cleanup(output_text)
|
289
|
+
unless output_text.empty?
|
290
|
+
outfilename = File.join(@output_dir, @outfile_base + ".txt")
|
291
|
+
@fp = File.open(outfilename, "w")
|
292
|
+
@fp.puts(output_text)
|
293
|
+
@fp.close
|
327
294
|
end
|
295
|
+
File.delete(@input_file) if @del_interfile
|
296
|
+
output_text = +""
|
328
297
|
end
|
329
298
|
end
|
330
299
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "rspec"
|
3
4
|
|
4
5
|
RSpec.configure do |config|
|
5
|
-
|
6
|
-
end
|
6
|
+
end
|