wp2txt 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +34 -6
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +129 -155
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/lib/wp2txt.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
1
|
+
# frozen_string_literal: true
|
5
2
|
|
6
3
|
require "nokogiri"
|
7
|
-
|
8
|
-
|
4
|
+
require_relative "wp2txt/article"
|
5
|
+
require_relative "wp2txt/utils"
|
9
6
|
|
10
7
|
module Wp2txt
|
11
8
|
class Splitter
|
@@ -15,51 +12,56 @@ module Wp2txt
|
|
15
12
|
@input_file = input_file
|
16
13
|
@output_dir = output_dir
|
17
14
|
@tfile_size = tfile_size
|
18
|
-
if bz2_gem
|
19
|
-
require "bzip2-ruby"
|
20
|
-
end
|
15
|
+
require "bzip2-ruby" if bz2_gem
|
21
16
|
@bz2_gem = bz2_gem
|
22
17
|
prepare
|
23
18
|
end
|
24
19
|
|
25
20
|
def file_size(file)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
21
|
+
size = 0
|
22
|
+
unit = 10_485_760
|
23
|
+
star = 0
|
24
|
+
before = Time.now.to_f
|
25
|
+
|
26
|
+
loop do
|
30
27
|
begin
|
31
28
|
a = file.read(unit)
|
32
|
-
rescue
|
29
|
+
rescue StandardError
|
33
30
|
a = nil
|
34
31
|
end
|
35
32
|
break unless a
|
36
33
|
|
37
34
|
present = Time.now.to_f
|
38
35
|
size += a.size
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
36
|
+
|
37
|
+
next if present - before <= 0.3
|
38
|
+
|
39
|
+
star = 0 if star > 10
|
40
|
+
star += 1
|
41
|
+
before = present
|
44
42
|
end
|
45
|
-
time_elapsed = Time.now - origin
|
46
43
|
size
|
47
44
|
end
|
48
45
|
|
49
46
|
# check if a given command exists: return the path if it does, return false if not
|
50
47
|
def command_exist?(command)
|
51
48
|
basename = File.basename(command)
|
52
|
-
path = ""
|
49
|
+
path = +""
|
53
50
|
print "Checking #{basename}: "
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
51
|
+
begin
|
52
|
+
if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
|
53
|
+
puts "detected [#{path}]"
|
54
|
+
path.strip
|
55
|
+
elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
|
56
|
+
puts "detected [#{path}]"
|
57
|
+
path.strip
|
58
|
+
else
|
59
|
+
puts "#{basename} not found"
|
60
|
+
false
|
61
|
+
end
|
62
|
+
rescue StandardError
|
63
|
+
puts "#{basename} not found"
|
64
|
+
false
|
63
65
|
end
|
64
66
|
end
|
65
67
|
|
@@ -67,28 +69,22 @@ module Wp2txt
|
|
67
69
|
def prepare
|
68
70
|
# if output_dir is not specified, output in the same directory
|
69
71
|
# as the input file
|
70
|
-
if !@output_dir && @input_file
|
71
|
-
@output_dir = File.dirname(@input_file)
|
72
|
-
end
|
72
|
+
@output_dir = File.dirname(@input_file) if !@output_dir && @input_file
|
73
73
|
|
74
74
|
if /.bz2$/ =~ @input_file
|
75
75
|
if @bz2_gem
|
76
76
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
77
|
-
elsif
|
77
|
+
elsif Gem.win_platform?
|
78
78
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
79
|
-
|
80
|
-
|
81
|
-
command_exist?("pbzip2") ||
|
82
|
-
command_exist?("bzip2")
|
83
|
-
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
84
|
-
end
|
79
|
+
elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
|
80
|
+
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
85
81
|
end
|
86
82
|
else # meaning that it is a text file
|
87
83
|
@infile_size = File.stat(@input_file).size
|
88
84
|
file = open(@input_file)
|
89
85
|
end
|
90
86
|
|
91
|
-
#create basename of output file
|
87
|
+
# create basename of output file
|
92
88
|
@outfile_base = File.basename(@input_file, ".*") + "-"
|
93
89
|
@total_size = 0
|
94
90
|
@file_index = 1
|
@@ -97,15 +93,15 @@ module Wp2txt
|
|
97
93
|
@outfiles << outfilename
|
98
94
|
@fp = File.open(outfilename, "w")
|
99
95
|
@file_pointer = file
|
100
|
-
|
96
|
+
true
|
101
97
|
end
|
102
98
|
|
103
99
|
# read text data from bz2 compressed file in chunks of 10 megabytes (10_485_760 bytes)
|
104
100
|
def fill_buffer
|
105
|
-
|
101
|
+
loop do
|
106
102
|
begin
|
107
|
-
new_lines = @file_pointer.read(
|
108
|
-
rescue
|
103
|
+
new_lines = @file_pointer.read(10_485_760)
|
104
|
+
rescue StandardError
|
109
105
|
return nil
|
110
106
|
end
|
111
107
|
return nil unless new_lines
|
@@ -113,68 +109,58 @@ module Wp2txt
|
|
113
109
|
# temp_buf is filled with text split by "\n"
|
114
110
|
temp_buf = []
|
115
111
|
ss = StringScanner.new(new_lines)
|
116
|
-
while ss.scan(/.*?\n/m)
|
117
|
-
temp_buf << ss[0]
|
118
|
-
end
|
112
|
+
temp_buf << ss[0] while ss.scan(/.*?\n/m)
|
119
113
|
temp_buf << ss.rest unless ss.eos?
|
120
114
|
|
121
115
|
new_first_line = temp_buf.shift
|
122
|
-
|
123
|
-
|
124
|
-
@buffer << ""
|
125
|
-
else
|
126
|
-
@buffer.last << new_first_line
|
127
|
-
end
|
116
|
+
@buffer.last << new_first_line
|
117
|
+
@buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
128
118
|
@buffer += temp_buf unless temp_buf.empty?
|
129
|
-
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
130
|
-
@buffer << ""
|
131
|
-
end
|
119
|
+
@buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
132
120
|
break if @buffer.size > 1
|
133
121
|
end
|
134
|
-
|
122
|
+
true
|
135
123
|
end
|
136
124
|
|
137
125
|
def get_newline
|
138
|
-
@buffer ||= [""]
|
139
|
-
if @buffer.size == 1
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
return nil
|
126
|
+
@buffer ||= [+""]
|
127
|
+
if @buffer.size == 1 && !fill_buffer
|
128
|
+
nil
|
129
|
+
elsif @buffer.empty?
|
130
|
+
nil
|
144
131
|
else
|
145
|
-
|
146
|
-
return new_line
|
132
|
+
@buffer.shift
|
147
133
|
end
|
148
134
|
end
|
149
135
|
|
150
136
|
def split_file
|
151
|
-
output_text = ""
|
137
|
+
output_text = +""
|
152
138
|
end_flag = false
|
153
|
-
while text = get_newline
|
154
|
-
@count ||= 0
|
155
|
-
@
|
139
|
+
while (text = get_newline)
|
140
|
+
@count ||= 0
|
141
|
+
@count += 1
|
142
|
+
@size_read ||= 0
|
156
143
|
@size_read += text.bytesize
|
157
144
|
@total_size += text.bytesize
|
158
145
|
output_text << text
|
159
146
|
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
160
147
|
# never close the file until the end of the page even if end_flag is on
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
end
|
148
|
+
next unless end_flag && %r{</page} =~ text
|
149
|
+
|
150
|
+
@fp.puts(output_text)
|
151
|
+
output_text = +""
|
152
|
+
@total_size = 0
|
153
|
+
end_flag = false
|
154
|
+
@fp.close
|
155
|
+
@file_index += 1
|
156
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
157
|
+
@outfiles << outfilename
|
158
|
+
@fp = File.open(outfilename, "w")
|
173
159
|
end
|
174
160
|
@fp.puts(output_text) if output_text != ""
|
175
161
|
@fp.close
|
176
162
|
|
177
|
-
if File.size(outfilename)
|
163
|
+
if File.size(outfilename).zero?
|
178
164
|
File.delete(outfilename)
|
179
165
|
@outfiles.delete(outfilename)
|
180
166
|
end
|
@@ -201,14 +187,14 @@ module Wp2txt
|
|
201
187
|
@file_pointer = file
|
202
188
|
@outfile_base = File.basename(@input_file, ".*")
|
203
189
|
@total_size = 0
|
204
|
-
|
190
|
+
true
|
205
191
|
end
|
206
192
|
|
207
193
|
def fill_buffer
|
208
|
-
|
194
|
+
loop do
|
209
195
|
begin
|
210
|
-
new_lines = @file_pointer.read(
|
211
|
-
rescue
|
196
|
+
new_lines = @file_pointer.read(10_485_760)
|
197
|
+
rescue StandardError
|
212
198
|
return nil
|
213
199
|
end
|
214
200
|
return nil unless new_lines
|
@@ -216,49 +202,40 @@ module Wp2txt
|
|
216
202
|
# temp_buf is filled with text split by "\n"
|
217
203
|
temp_buf = []
|
218
204
|
ss = StringScanner.new(new_lines)
|
219
|
-
while ss.scan(/.*?\n/m)
|
220
|
-
temp_buf << ss[0]
|
221
|
-
end
|
205
|
+
temp_buf << ss[0] while ss.scan(/.*?\n/m)
|
222
206
|
temp_buf << ss.rest unless ss.eos?
|
223
207
|
|
224
208
|
new_first_line = temp_buf.shift
|
225
|
-
|
226
|
-
|
227
|
-
@buffer << ""
|
228
|
-
else
|
229
|
-
@buffer.last << new_first_line
|
230
|
-
end
|
209
|
+
@buffer.last << new_first_line
|
210
|
+
@buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
|
231
211
|
@buffer += temp_buf unless temp_buf.empty?
|
232
|
-
if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
233
|
-
@buffer << ""
|
234
|
-
end
|
212
|
+
@buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
|
235
213
|
break if @buffer.size > 1
|
236
214
|
end
|
237
|
-
|
215
|
+
true
|
238
216
|
end
|
239
217
|
|
240
218
|
def get_newline
|
241
|
-
@buffer ||= [""]
|
242
|
-
if @buffer.size == 1
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
return nil
|
219
|
+
@buffer ||= [+""]
|
220
|
+
if @buffer.size == 1 && !fill_buffer
|
221
|
+
nil
|
222
|
+
elsif @buffer.empty?
|
223
|
+
nil
|
247
224
|
else
|
248
|
-
|
249
|
-
return new_line
|
225
|
+
@buffer.shift
|
250
226
|
end
|
251
227
|
end
|
252
228
|
|
253
229
|
def get_page
|
254
230
|
inside_page = false
|
255
|
-
page = ""
|
256
|
-
while line = get_newline
|
257
|
-
|
231
|
+
page = +""
|
232
|
+
while (line = get_newline)
|
233
|
+
case line
|
234
|
+
when /<page>/
|
258
235
|
page << line
|
259
236
|
inside_page = true
|
260
237
|
next
|
261
|
-
|
238
|
+
when %r{</page>}
|
262
239
|
page << line
|
263
240
|
inside_page = false
|
264
241
|
break
|
@@ -266,65 +243,62 @@ module Wp2txt
|
|
266
243
|
page << line if inside_page
|
267
244
|
end
|
268
245
|
if page.empty?
|
269
|
-
|
246
|
+
false
|
270
247
|
else
|
271
|
-
|
248
|
+
page.force_encoding("utf-8")
|
272
249
|
end
|
250
|
+
rescue StandardError
|
251
|
+
page
|
273
252
|
end
|
274
253
|
|
275
254
|
def extract_text(&block)
|
276
|
-
in_text = false
|
277
|
-
in_message = false
|
278
|
-
result_text = ""
|
279
255
|
title = nil
|
280
|
-
|
281
|
-
terminal_round = false
|
282
|
-
output_text = ""
|
256
|
+
output_text = +""
|
283
257
|
pages = []
|
284
258
|
data_empty = false
|
285
259
|
|
286
|
-
|
287
|
-
|
288
|
-
if
|
289
|
-
pages <<
|
260
|
+
until data_empty
|
261
|
+
new_page = get_page
|
262
|
+
if new_page
|
263
|
+
pages << new_page
|
290
264
|
else
|
291
265
|
data_empty = true
|
292
266
|
end
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
article = Article.new(text, title, @strip_tmarker)
|
313
|
-
page_text = block.call(article)
|
314
|
-
output_text << page_text
|
267
|
+
next unless data_empty
|
268
|
+
|
269
|
+
pages.each do |page|
|
270
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
271
|
+
xml = xmlns + page + "</mediawiki>"
|
272
|
+
|
273
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
274
|
+
page = input.xpath("//xmlns:text").first
|
275
|
+
pp_title = page.parent.parent.at_css "title"
|
276
|
+
title = pp_title.content
|
277
|
+
next if /:/ =~ title
|
278
|
+
|
279
|
+
text = page.content
|
280
|
+
text.gsub!(/<!--(.*?)-->/m) do |content|
|
281
|
+
num_of_newlines = content.count("\n")
|
282
|
+
if num_of_newlines.zero?
|
283
|
+
+""
|
284
|
+
else
|
285
|
+
"\n" * num_of_newlines
|
315
286
|
end
|
316
287
|
end
|
288
|
+
article = Article.new(text, title, @strip_tmarker)
|
289
|
+
page_text = block.call(article)
|
290
|
+
output_text << page_text
|
291
|
+
end
|
317
292
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
end
|
325
|
-
File.delete(@input_file) if @del_interfile
|
326
|
-
output_text = ""
|
293
|
+
output_text = cleanup(output_text)
|
294
|
+
unless output_text.empty?
|
295
|
+
outfilename = File.join(@output_dir, @outfile_base + ".txt")
|
296
|
+
@fp = File.open(outfilename, "w")
|
297
|
+
@fp.puts(output_text)
|
298
|
+
@fp.close
|
327
299
|
end
|
300
|
+
File.delete(@input_file) if @del_interfile
|
301
|
+
output_text = +""
|
328
302
|
end
|
329
303
|
end
|
330
304
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "rspec"
|
3
4
|
|
4
5
|
RSpec.configure do |config|
|
5
|
-
|
6
|
-
end
|
6
|
+
end
|