wp2txt 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt.rb CHANGED
@@ -1,11 +1,8 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
1
+ # frozen_string_literal: true
5
2
 
6
3
  require "nokogiri"
7
- require "wp2txt/article"
8
- require "wp2txt/utils"
4
+ require_relative "wp2txt/article"
5
+ require_relative "wp2txt/utils"
9
6
 
10
7
  module Wp2txt
11
8
  class Splitter
@@ -15,51 +12,56 @@ module Wp2txt
15
12
  @input_file = input_file
16
13
  @output_dir = output_dir
17
14
  @tfile_size = tfile_size
18
- if bz2_gem
19
- require "bzip2-ruby"
20
- end
15
+ require "bzip2-ruby" if bz2_gem
21
16
  @bz2_gem = bz2_gem
22
17
  prepare
23
18
  end
24
19
 
25
20
  def file_size(file)
26
- origin = Time.now
27
- size = 0; unit = 10485760; star = 0; before = Time.now.to_f
28
- error_count = 10
29
- while true do
21
+ size = 0
22
+ unit = 10_485_760
23
+ star = 0
24
+ before = Time.now.to_f
25
+
26
+ loop do
30
27
  begin
31
28
  a = file.read(unit)
32
- rescue => e
29
+ rescue StandardError
33
30
  a = nil
34
31
  end
35
32
  break unless a
36
33
 
37
34
  present = Time.now.to_f
38
35
  size += a.size
39
- if present - before > 0.3
40
- star = 0 if star > 10
41
- star += 1
42
- before = present
43
- end
36
+
37
+ next if present - before <= 0.3
38
+
39
+ star = 0 if star > 10
40
+ star += 1
41
+ before = present
44
42
  end
45
- time_elapsed = Time.now - origin
46
43
  size
47
44
  end
48
45
 
49
46
  # check if a given command exists: return the path if it does, return false if not
50
47
  def command_exist?(command)
51
48
  basename = File.basename(command)
52
- path = ""
49
+ path = +""
53
50
  print "Checking #{basename}: "
54
- if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
55
- puts "detected [#{path}]"
56
- return path.strip
57
- elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
58
- puts "detected [#{path}]"
59
- return path.strip
60
- else
61
- puts "not found"
62
- return false
51
+ begin
52
+ if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
53
+ puts "detected [#{path}]"
54
+ path.strip
55
+ elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
56
+ puts "detected [#{path}]"
57
+ path.strip
58
+ else
59
+ puts "#{basename} not found"
60
+ false
61
+ end
62
+ rescue StandardError
63
+ puts "#{basename} not found"
64
+ false
63
65
  end
64
66
  end
65
67
 
@@ -67,28 +69,22 @@ module Wp2txt
67
69
  def prepare
68
70
  # if output_dir is not specified, output in the same directory
69
71
  # as the imput file
70
- if !@output_dir && @input_file
71
- @output_dir = File.dirname(@input_file)
72
- end
72
+ @output_dir = File.dirname(@input_file) if !@output_dir && @input_file
73
73
 
74
74
  if /.bz2$/ =~ @input_file
75
75
  if @bz2_gem
76
76
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
- elsif RUBY_PLATFORM.index("win32")
77
+ elsif Gem.win_platform?
78
78
  file = IO.popen("bunzip2.exe -c #{@input_file}")
79
- else
80
- if bzpath = command_exist?("lbzip2") ||
81
- command_exist?("pbzip2") ||
82
- command_exist?("bzip2")
83
- file = IO.popen("#{bzpath} -c -d #{@input_file}")
84
- end
79
+ elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
80
+ file = IO.popen("#{bzpath} -c -d #{@input_file}")
85
81
  end
86
82
  else # meaning that it is a text file
87
83
  @infile_size = File.stat(@input_file).size
88
84
  file = open(@input_file)
89
85
  end
90
86
 
91
- #create basename of output file
87
+ # create basename of output file
92
88
  @outfile_base = File.basename(@input_file, ".*") + "-"
93
89
  @total_size = 0
94
90
  @file_index = 1
@@ -97,15 +93,15 @@ module Wp2txt
97
93
  @outfiles << outfilename
98
94
  @fp = File.open(outfilename, "w")
99
95
  @file_pointer = file
100
- return true
96
+ true
101
97
  end
102
98
 
103
99
  # read text data from bz2 compressed file by 1 megabyte
104
100
  def fill_buffer
105
- while true do
101
+ loop do
106
102
  begin
107
- new_lines = @file_pointer.read(10485760)
108
- rescue => e
103
+ new_lines = @file_pointer.read(10_485_760)
104
+ rescue StandardError
109
105
  return nil
110
106
  end
111
107
  return nil unless new_lines
@@ -113,68 +109,58 @@ module Wp2txt
113
109
  # temp_buf is filled with text split by "\n"
114
110
  temp_buf = []
115
111
  ss = StringScanner.new(new_lines)
116
- while ss.scan(/.*?\n/m)
117
- temp_buf << ss[0]
118
- end
112
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
119
113
  temp_buf << ss.rest unless ss.eos?
120
114
 
121
115
  new_first_line = temp_buf.shift
122
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
123
- @buffer.last << new_first_line
124
- @buffer << ""
125
- else
126
- @buffer.last << new_first_line
127
- end
116
+ @buffer.last << new_first_line
117
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
128
118
  @buffer += temp_buf unless temp_buf.empty?
129
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
130
- @buffer << ""
131
- end
119
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
132
120
  break if @buffer.size > 1
133
121
  end
134
- return true
122
+ true
135
123
  end
136
124
 
137
125
  def get_newline
138
- @buffer ||= [""]
139
- if @buffer.size == 1
140
- return nil unless fill_buffer
141
- end
142
- if @buffer.empty?
143
- return nil
126
+ @buffer ||= [+""]
127
+ if @buffer.size == 1 && !fill_buffer
128
+ nil
129
+ elsif @buffer.empty?
130
+ nil
144
131
  else
145
- new_line = @buffer.shift
146
- return new_line
132
+ @buffer.shift
147
133
  end
148
134
  end
149
135
 
150
136
  def split_file
151
- output_text = ""
137
+ output_text = +""
152
138
  end_flag = false
153
- while text = get_newline
154
- @count ||= 0;@count += 1;
155
- @size_read ||=0
139
+ while (text = get_newline)
140
+ @count ||= 0
141
+ @count += 1
142
+ @size_read ||= 0
156
143
  @size_read += text.bytesize
157
144
  @total_size += text.bytesize
158
145
  output_text << text
159
146
  end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
160
147
  # never close the file until the end of the page even if end_flag is on
161
- if end_flag && /<\/page/ =~ text
162
- @fp.puts(output_text)
163
- output_text = ""
164
- @total_size = 0
165
- end_flag = false
166
- @fp.close
167
- @file_index += 1
168
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
169
- @outfiles << outfilename
170
- @fp = File.open(outfilename, "w")
171
- next
172
- end
148
+ next unless end_flag && %r{</page} =~ text
149
+
150
+ @fp.puts(output_text)
151
+ output_text = +""
152
+ @total_size = 0
153
+ end_flag = false
154
+ @fp.close
155
+ @file_index += 1
156
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
157
+ @outfiles << outfilename
158
+ @fp = File.open(outfilename, "w")
173
159
  end
174
160
  @fp.puts(output_text) if output_text != ""
175
161
  @fp.close
176
162
 
177
- if File.size(outfilename) == 0
163
+ if File.size(outfilename).zero?
178
164
  File.delete(outfilename)
179
165
  @outfiles.delete(outfilename)
180
166
  end
@@ -201,14 +187,14 @@ module Wp2txt
201
187
  @file_pointer = file
202
188
  @outfile_base = File.basename(@input_file, ".*")
203
189
  @total_size = 0
204
- return true
190
+ true
205
191
  end
206
192
 
207
193
  def fill_buffer
208
- while true do
194
+ loop do
209
195
  begin
210
- new_lines = @file_pointer.read(10485760)
211
- rescue => e
196
+ new_lines = @file_pointer.read(10_485_760)
197
+ rescue StandardError
212
198
  return nil
213
199
  end
214
200
  return nil unless new_lines
@@ -216,49 +202,40 @@ module Wp2txt
216
202
  # temp_buf is filled with text split by "\n"
217
203
  temp_buf = []
218
204
  ss = StringScanner.new(new_lines)
219
- while ss.scan(/.*?\n/m)
220
- temp_buf << ss[0]
221
- end
205
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
222
206
  temp_buf << ss.rest unless ss.eos?
223
207
 
224
208
  new_first_line = temp_buf.shift
225
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
226
- @buffer.last << new_first_line
227
- @buffer << ""
228
- else
229
- @buffer.last << new_first_line
230
- end
209
+ @buffer.last << new_first_line
210
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
231
211
  @buffer += temp_buf unless temp_buf.empty?
232
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
233
- @buffer << ""
234
- end
212
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
235
213
  break if @buffer.size > 1
236
214
  end
237
- return true
215
+ true
238
216
  end
239
217
 
240
218
  def get_newline
241
- @buffer ||= [""]
242
- if @buffer.size == 1
243
- return nil unless fill_buffer
244
- end
245
- if @buffer.empty?
246
- return nil
219
+ @buffer ||= [+""]
220
+ if @buffer.size == 1 && !fill_buffer
221
+ nil
222
+ elsif @buffer.empty?
223
+ nil
247
224
  else
248
- new_line = @buffer.shift
249
- return new_line
225
+ @buffer.shift
250
226
  end
251
227
  end
252
228
 
253
229
  def get_page
254
230
  inside_page = false
255
- page = ""
256
- while line = get_newline
257
- if /<page>/ =~ line #
231
+ page = +""
232
+ while (line = get_newline)
233
+ case line
234
+ when /<page>/
258
235
  page << line
259
236
  inside_page = true
260
237
  next
261
- elsif /<\/page>/ =~ line #
238
+ when %r{</page>}
262
239
  page << line
263
240
  inside_page = false
264
241
  break
@@ -266,65 +243,62 @@ module Wp2txt
266
243
  page << line if inside_page
267
244
  end
268
245
  if page.empty?
269
- return false
246
+ false
270
247
  else
271
- return page.force_encoding("utf-8") rescue page
248
+ page.force_encoding("utf-8")
272
249
  end
250
+ rescue StandardError
251
+ page
273
252
  end
274
253
 
275
254
  def extract_text(&block)
276
- in_text = false
277
- in_message = false
278
- result_text = ""
279
255
  title = nil
280
- end_flag = false
281
- terminal_round = false
282
- output_text = ""
256
+ output_text = +""
283
257
  pages = []
284
258
  data_empty = false
285
259
 
286
- while !data_empty
287
- page = get_page
288
- if page
289
- pages << page
260
+ until data_empty
261
+ new_page = get_page
262
+ if new_page
263
+ pages << new_page
290
264
  else
291
265
  data_empty = true
292
266
  end
293
- if data_empty
294
- pages.each do |page|
295
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
296
- xml = xmlns + page + "</mediawiki>"
297
-
298
- input = Nokogiri::XML(xml, nil, 'UTF-8')
299
- page = input.xpath("//xmlns:text").first
300
- pp_title = page.parent.parent.at_css "title"
301
- title = pp_title.content
302
- unless /\:/ =~ title
303
- text = page.content
304
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
305
- num_of_newlines = content.count("\n")
306
- if num_of_newlines == 0
307
- ""
308
- else
309
- "\n" * num_of_newlines
310
- end
311
- end
312
- article = Article.new(text, title, @strip_tmarker)
313
- page_text = block.call(article)
314
- output_text << page_text
267
+ next unless data_empty
268
+
269
+ pages.each do |page|
270
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
271
+ xml = xmlns + page + "</mediawiki>"
272
+
273
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
274
+ page = input.xpath("//xmlns:text").first
275
+ pp_title = page.parent.parent.at_css "title"
276
+ title = pp_title.content
277
+ next if /:/ =~ title
278
+
279
+ text = page.content
280
+ text.gsub!(/<!--(.*?)-->/m) do |content|
281
+ num_of_newlines = content.count("\n")
282
+ if num_of_newlines.zero?
283
+ +""
284
+ else
285
+ "\n" * num_of_newlines
315
286
  end
316
287
  end
288
+ article = Article.new(text, title, @strip_tmarker)
289
+ page_text = block.call(article)
290
+ output_text << page_text
291
+ end
317
292
 
318
- cleanup!(output_text)
319
- if output_text.size > 0
320
- outfilename = File.join(@output_dir, @outfile_base + ".txt")
321
- @fp = File.open(outfilename, "w")
322
- @fp.puts(output_text)
323
- @fp.close
324
- end
325
- File.delete(@input_file) if @del_interfile
326
- output_text = ""
293
+ output_text = cleanup(output_text)
294
+ unless output_text.empty?
295
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
296
+ @fp = File.open(outfilename, "w")
297
+ @fp.puts(output_text)
298
+ @fp.close
327
299
  end
300
+ File.delete(@input_file) if @del_interfile
301
+ output_text = +""
328
302
  end
329
303
  end
330
304
  end
data/spec/spec_helper.rb CHANGED
@@ -1,6 +1,6 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require 'rspec'
1
+ # frozen_string_literal: true
2
+
3
+ require "rspec"
3
4
 
4
5
  RSpec.configure do |config|
5
- # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more infomation
6
- end
6
+ end