wp2txt 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt.rb CHANGED
@@ -1,11 +1,8 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
1
+ # frozen_string_literal: true
5
2
 
6
3
  require "nokogiri"
7
- require "wp2txt/article"
8
- require "wp2txt/utils"
4
+ require_relative "wp2txt/article"
5
+ require_relative "wp2txt/utils"
9
6
 
10
7
  module Wp2txt
11
8
  class Splitter
@@ -15,51 +12,51 @@ module Wp2txt
15
12
  @input_file = input_file
16
13
  @output_dir = output_dir
17
14
  @tfile_size = tfile_size
18
- if bz2_gem
19
- require "bzip2-ruby"
20
- end
15
+ require "bzip2-ruby" if bz2_gem
21
16
  @bz2_gem = bz2_gem
22
17
  prepare
23
18
  end
24
19
 
25
20
  def file_size(file)
26
- origin = Time.now
27
- size = 0; unit = 10485760; star = 0; before = Time.now.to_f
28
- error_count = 10
29
- while true do
21
+ size = 0
22
+ unit = 10_485_760
23
+ star = 0
24
+ before = Time.now.to_f
25
+
26
+ loop do
30
27
  begin
31
28
  a = file.read(unit)
32
- rescue => e
29
+ rescue StandardError
33
30
  a = nil
34
31
  end
35
32
  break unless a
36
33
 
37
34
  present = Time.now.to_f
38
35
  size += a.size
39
- if present - before > 0.3
40
- star = 0 if star > 10
41
- star += 1
42
- before = present
43
- end
36
+
37
+ next if present - before <= 0.3
38
+
39
+ star = 0 if star > 10
40
+ star += 1
41
+ before = present
44
42
  end
45
- time_elapsed = Time.now - origin
46
43
  size
47
44
  end
48
45
 
49
46
  # check if a given command exists: return the path if it does, return false if not
50
47
  def command_exist?(command)
51
48
  basename = File.basename(command)
52
- path = ""
49
+ path = +""
53
50
  print "Checking #{basename}: "
54
- if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
51
+ if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
55
52
  puts "detected [#{path}]"
56
- return path.strip
57
- elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
53
+ path.strip
54
+ elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
58
55
  puts "detected [#{path}]"
59
- return path.strip
56
+ path.strip
60
57
  else
61
58
  puts "not found"
62
- return false
59
+ false
63
60
  end
64
61
  end
65
62
 
@@ -67,28 +64,22 @@ module Wp2txt
67
64
  def prepare
68
65
  # if output_dir is not specified, output in the same directory
69
66
  # as the input file
70
- if !@output_dir && @input_file
71
- @output_dir = File.dirname(@input_file)
72
- end
67
+ @output_dir = File.dirname(@input_file) if !@output_dir && @input_file
73
68
 
74
69
  if /.bz2$/ =~ @input_file
75
70
  if @bz2_gem
76
71
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
72
  elsif RUBY_PLATFORM.index("win32")
78
73
  file = IO.popen("bunzip2.exe -c #{@input_file}")
79
- else
80
- if bzpath = command_exist?("lbzip2") ||
81
- command_exist?("pbzip2") ||
82
- command_exist?("bzip2")
83
- file = IO.popen("#{bzpath} -c -d #{@input_file}")
84
- end
74
+ elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
75
+ file = IO.popen("#{bzpath} -c -d #{@input_file}")
85
76
  end
86
77
  else # meaning that it is a text file
87
78
  @infile_size = File.stat(@input_file).size
88
79
  file = open(@input_file)
89
80
  end
90
81
 
91
- #create basename of output file
82
+ # create basename of output file
92
83
  @outfile_base = File.basename(@input_file, ".*") + "-"
93
84
  @total_size = 0
94
85
  @file_index = 1
@@ -97,15 +88,15 @@ module Wp2txt
97
88
  @outfiles << outfilename
98
89
  @fp = File.open(outfilename, "w")
99
90
  @file_pointer = file
100
- return true
91
+ true
101
92
  end
102
93
 
103
94
  # read text data from bz2 compressed file by 1 megabyte
104
95
  def fill_buffer
105
- while true do
96
+ loop do
106
97
  begin
107
- new_lines = @file_pointer.read(10485760)
108
- rescue => e
98
+ new_lines = @file_pointer.read(10_485_760)
99
+ rescue StandardError
109
100
  return nil
110
101
  end
111
102
  return nil unless new_lines
@@ -113,68 +104,58 @@ module Wp2txt
113
104
  # temp_buf is filled with text split by "\n"
114
105
  temp_buf = []
115
106
  ss = StringScanner.new(new_lines)
116
- while ss.scan(/.*?\n/m)
117
- temp_buf << ss[0]
118
- end
107
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
119
108
  temp_buf << ss.rest unless ss.eos?
120
109
 
121
110
  new_first_line = temp_buf.shift
122
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
123
- @buffer.last << new_first_line
124
- @buffer << ""
125
- else
126
- @buffer.last << new_first_line
127
- end
111
+ @buffer.last << new_first_line
112
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
128
113
  @buffer += temp_buf unless temp_buf.empty?
129
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
130
- @buffer << ""
131
- end
114
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
132
115
  break if @buffer.size > 1
133
116
  end
134
- return true
117
+ true
135
118
  end
136
119
 
137
120
  def get_newline
138
- @buffer ||= [""]
139
- if @buffer.size == 1
140
- return nil unless fill_buffer
141
- end
142
- if @buffer.empty?
143
- return nil
121
+ @buffer ||= [+""]
122
+ if @buffer.size == 1 && !fill_buffer
123
+ nil
124
+ elsif @buffer.empty?
125
+ nil
144
126
  else
145
- new_line = @buffer.shift
146
- return new_line
127
+ @buffer.shift
147
128
  end
148
129
  end
149
130
 
150
131
  def split_file
151
- output_text = ""
132
+ output_text = +""
152
133
  end_flag = false
153
- while text = get_newline
154
- @count ||= 0;@count += 1;
155
- @size_read ||=0
134
+ while (text = get_newline)
135
+ @count ||= 0
136
+ @count += 1
137
+ @size_read ||= 0
156
138
  @size_read += text.bytesize
157
139
  @total_size += text.bytesize
158
140
  output_text << text
159
141
  end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
160
142
  # never close the file until the end of the page even if end_flag is on
161
- if end_flag && /<\/page/ =~ text
162
- @fp.puts(output_text)
163
- output_text = ""
164
- @total_size = 0
165
- end_flag = false
166
- @fp.close
167
- @file_index += 1
168
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
169
- @outfiles << outfilename
170
- @fp = File.open(outfilename, "w")
171
- next
172
- end
143
+ next unless end_flag && %r{</page} =~ text
144
+
145
+ @fp.puts(output_text)
146
+ output_text = +""
147
+ @total_size = 0
148
+ end_flag = false
149
+ @fp.close
150
+ @file_index += 1
151
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
152
+ @outfiles << outfilename
153
+ @fp = File.open(outfilename, "w")
173
154
  end
174
155
  @fp.puts(output_text) if output_text != ""
175
156
  @fp.close
176
157
 
177
- if File.size(outfilename) == 0
158
+ if File.size(outfilename).zero?
178
159
  File.delete(outfilename)
179
160
  @outfiles.delete(outfilename)
180
161
  end
@@ -201,14 +182,14 @@ module Wp2txt
201
182
  @file_pointer = file
202
183
  @outfile_base = File.basename(@input_file, ".*")
203
184
  @total_size = 0
204
- return true
185
+ true
205
186
  end
206
187
 
207
188
  def fill_buffer
208
- while true do
189
+ loop do
209
190
  begin
210
- new_lines = @file_pointer.read(10485760)
211
- rescue => e
191
+ new_lines = @file_pointer.read(10_485_760)
192
+ rescue StandardError
212
193
  return nil
213
194
  end
214
195
  return nil unless new_lines
@@ -216,49 +197,40 @@ module Wp2txt
216
197
  # temp_buf is filled with text split by "\n"
217
198
  temp_buf = []
218
199
  ss = StringScanner.new(new_lines)
219
- while ss.scan(/.*?\n/m)
220
- temp_buf << ss[0]
221
- end
200
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
222
201
  temp_buf << ss.rest unless ss.eos?
223
202
 
224
203
  new_first_line = temp_buf.shift
225
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
226
- @buffer.last << new_first_line
227
- @buffer << ""
228
- else
229
- @buffer.last << new_first_line
230
- end
204
+ @buffer.last << new_first_line
205
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
231
206
  @buffer += temp_buf unless temp_buf.empty?
232
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
233
- @buffer << ""
234
- end
207
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
235
208
  break if @buffer.size > 1
236
209
  end
237
- return true
210
+ true
238
211
  end
239
212
 
240
213
  def get_newline
241
- @buffer ||= [""]
242
- if @buffer.size == 1
243
- return nil unless fill_buffer
244
- end
245
- if @buffer.empty?
246
- return nil
214
+ @buffer ||= [+""]
215
+ if @buffer.size == 1 && !fill_buffer
216
+ nil
217
+ elsif @buffer.empty?
218
+ nil
247
219
  else
248
- new_line = @buffer.shift
249
- return new_line
220
+ @buffer.shift
250
221
  end
251
222
  end
252
223
 
253
224
  def get_page
254
225
  inside_page = false
255
- page = ""
256
- while line = get_newline
257
- if /<page>/ =~ line #
226
+ page = +""
227
+ while (line = get_newline)
228
+ case line
229
+ when /<page>/
258
230
  page << line
259
231
  inside_page = true
260
232
  next
261
- elsif /<\/page>/ =~ line #
233
+ when %r{</page>}
262
234
  page << line
263
235
  inside_page = false
264
236
  break
@@ -266,65 +238,62 @@ module Wp2txt
266
238
  page << line if inside_page
267
239
  end
268
240
  if page.empty?
269
- return false
241
+ false
270
242
  else
271
- return page.force_encoding("utf-8") rescue page
243
+ page.force_encoding("utf-8")
272
244
  end
245
+ rescue StandardError
246
+ page
273
247
  end
274
248
 
275
249
  def extract_text(&block)
276
- in_text = false
277
- in_message = false
278
- result_text = ""
279
250
  title = nil
280
- end_flag = false
281
- terminal_round = false
282
- output_text = ""
251
+ output_text = +""
283
252
  pages = []
284
253
  data_empty = false
285
254
 
286
- while !data_empty
287
- page = get_page
288
- if page
289
- pages << page
255
+ until data_empty
256
+ new_page = get_page
257
+ if new_page
258
+ pages << new_page
290
259
  else
291
260
  data_empty = true
292
261
  end
293
- if data_empty
294
- pages.each do |page|
295
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
296
- xml = xmlns + page + "</mediawiki>"
297
-
298
- input = Nokogiri::XML(xml, nil, 'UTF-8')
299
- page = input.xpath("//xmlns:text").first
300
- pp_title = page.parent.parent.at_css "title"
301
- title = pp_title.content
302
- unless /\:/ =~ title
303
- text = page.content
304
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
305
- num_of_newlines = content.count("\n")
306
- if num_of_newlines == 0
307
- ""
308
- else
309
- "\n" * num_of_newlines
310
- end
311
- end
312
- article = Article.new(text, title, @strip_tmarker)
313
- page_text = block.call(article)
314
- output_text << page_text
262
+ next unless data_empty
263
+
264
+ pages.each do |page|
265
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
266
+ xml = xmlns + page + "</mediawiki>"
267
+
268
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
269
+ page = input.xpath("//xmlns:text").first
270
+ pp_title = page.parent.parent.at_css "title"
271
+ title = pp_title.content
272
+ next if /:/ =~ title
273
+
274
+ text = page.content
275
+ text.gsub!(/<!--(.*?)-->/m) do |content|
276
+ num_of_newlines = content.count("\n")
277
+ if num_of_newlines.zero?
278
+ +""
279
+ else
280
+ "\n" * num_of_newlines
315
281
  end
316
282
  end
283
+ article = Article.new(text, title, @strip_tmarker)
284
+ page_text = block.call(article)
285
+ output_text << page_text
286
+ end
317
287
 
318
- cleanup!(output_text)
319
- if output_text.size > 0
320
- outfilename = File.join(@output_dir, @outfile_base + ".txt")
321
- @fp = File.open(outfilename, "w")
322
- @fp.puts(output_text)
323
- @fp.close
324
- end
325
- File.delete(@input_file) if @del_interfile
326
- output_text = ""
288
+ output_text = cleanup(output_text)
289
+ unless output_text.empty?
290
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
291
+ @fp = File.open(outfilename, "w")
292
+ @fp.puts(output_text)
293
+ @fp.close
327
294
  end
295
+ File.delete(@input_file) if @del_interfile
296
+ output_text = +""
328
297
  end
329
298
  end
330
299
  end
data/spec/spec_helper.rb CHANGED
@@ -1,6 +1,6 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require 'rspec'
1
+ # frozen_string_literal: true
2
+
3
+ require "rspec"
3
4
 
4
5
  RSpec.configure do |config|
5
- # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more infomation
6
- end
6
+ end