wp2txt 1.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt.rb CHANGED
@@ -1,11 +1,8 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
1
+ # frozen_string_literal: true
5
2
 
6
3
  require "nokogiri"
7
- require "wp2txt/article"
8
- require "wp2txt/utils"
4
+ require_relative "wp2txt/article"
5
+ require_relative "wp2txt/utils"
9
6
 
10
7
  module Wp2txt
11
8
  class Splitter
@@ -15,51 +12,56 @@ module Wp2txt
15
12
  @input_file = input_file
16
13
  @output_dir = output_dir
17
14
  @tfile_size = tfile_size
18
- if bz2_gem
19
- require "bzip2-ruby"
20
- end
15
+ require "bzip2-ruby" if bz2_gem
21
16
  @bz2_gem = bz2_gem
22
17
  prepare
23
18
  end
24
19
 
25
20
  def file_size(file)
26
- origin = Time.now
27
- size = 0; unit = 10485760; star = 0; before = Time.now.to_f
28
- error_count = 10
29
- while true do
21
+ size = 0
22
+ unit = 10_485_760
23
+ star = 0
24
+ before = Time.now.to_f
25
+
26
+ loop do
30
27
  begin
31
28
  a = file.read(unit)
32
- rescue => e
29
+ rescue StandardError
33
30
  a = nil
34
31
  end
35
32
  break unless a
36
33
 
37
34
  present = Time.now.to_f
38
35
  size += a.size
39
- if present - before > 0.3
40
- star = 0 if star > 10
41
- star += 1
42
- before = present
43
- end
36
+
37
+ next if present - before <= 0.3
38
+
39
+ star = 0 if star > 10
40
+ star += 1
41
+ before = present
44
42
  end
45
- time_elapsed = Time.now - origin
46
43
  size
47
44
  end
48
45
 
49
46
  # check if a given command exists: return the path if it does, return false if not
50
47
  def command_exist?(command)
51
48
  basename = File.basename(command)
52
- path = ""
49
+ path = +""
53
50
  print "Checking #{basename}: "
54
- if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
55
- puts "detected [#{path}]"
56
- return path.strip
57
- elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
58
- puts "detected [#{path}]"
59
- return path.strip
60
- else
61
- puts "not found"
62
- return false
51
+ begin
52
+ if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
53
+ puts "detected [#{path}]"
54
+ path.strip
55
+ elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
56
+ puts "detected [#{path}]"
57
+ path.strip
58
+ else
59
+ puts "#{basename} not found"
60
+ false
61
+ end
62
+ rescue StandardError
63
+ puts "#{basename} not found"
64
+ false
63
65
  end
64
66
  end
65
67
 
@@ -67,28 +69,22 @@ module Wp2txt
67
69
  def prepare
68
70
  # if output_dir is not specified, output in the same directory
69
71
  # as the input file
70
- if !@output_dir && @input_file
71
- @output_dir = File.dirname(@input_file)
72
- end
72
+ @output_dir = File.dirname(@input_file) if !@output_dir && @input_file
73
73
 
74
74
  if /.bz2$/ =~ @input_file
75
75
  if @bz2_gem
76
76
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
- elsif RUBY_PLATFORM.index("win32")
77
+ elsif Gem.win_platform?
78
78
  file = IO.popen("bunzip2.exe -c #{@input_file}")
79
- else
80
- if bzpath = command_exist?("lbzip2") ||
81
- command_exist?("pbzip2") ||
82
- command_exist?("bzip2")
83
- file = IO.popen("#{bzpath} -c -d #{@input_file}")
84
- end
79
+ elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
80
+ file = IO.popen("#{bzpath} -c -d #{@input_file}")
85
81
  end
86
82
  else # meaning that it is a text file
87
83
  @infile_size = File.stat(@input_file).size
88
84
  file = open(@input_file)
89
85
  end
90
86
 
91
- #create basename of output file
87
+ # create basename of output file
92
88
  @outfile_base = File.basename(@input_file, ".*") + "-"
93
89
  @total_size = 0
94
90
  @file_index = 1
@@ -97,15 +93,15 @@ module Wp2txt
97
93
  @outfiles << outfilename
98
94
  @fp = File.open(outfilename, "w")
99
95
  @file_pointer = file
100
- return true
96
+ true
101
97
  end
102
98
 
103
99
  # read text data from bz2 compressed file by 1 megabyte
104
100
  def fill_buffer
105
- while true do
101
+ loop do
106
102
  begin
107
- new_lines = @file_pointer.read(10485760)
108
- rescue => e
103
+ new_lines = @file_pointer.read(10_485_760)
104
+ rescue StandardError
109
105
  return nil
110
106
  end
111
107
  return nil unless new_lines
@@ -113,68 +109,58 @@ module Wp2txt
113
109
  # temp_buf is filled with text split by "\n"
114
110
  temp_buf = []
115
111
  ss = StringScanner.new(new_lines)
116
- while ss.scan(/.*?\n/m)
117
- temp_buf << ss[0]
118
- end
112
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
119
113
  temp_buf << ss.rest unless ss.eos?
120
114
 
121
115
  new_first_line = temp_buf.shift
122
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
123
- @buffer.last << new_first_line
124
- @buffer << ""
125
- else
126
- @buffer.last << new_first_line
127
- end
116
+ @buffer.last << new_first_line
117
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
128
118
  @buffer += temp_buf unless temp_buf.empty?
129
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
130
- @buffer << ""
131
- end
119
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
132
120
  break if @buffer.size > 1
133
121
  end
134
- return true
122
+ true
135
123
  end
136
124
 
137
125
  def get_newline
138
- @buffer ||= [""]
139
- if @buffer.size == 1
140
- return nil unless fill_buffer
141
- end
142
- if @buffer.empty?
143
- return nil
126
+ @buffer ||= [+""]
127
+ if @buffer.size == 1 && !fill_buffer
128
+ nil
129
+ elsif @buffer.empty?
130
+ nil
144
131
  else
145
- new_line = @buffer.shift
146
- return new_line
132
+ @buffer.shift
147
133
  end
148
134
  end
149
135
 
150
136
  def split_file
151
- output_text = ""
137
+ output_text = +""
152
138
  end_flag = false
153
- while text = get_newline
154
- @count ||= 0;@count += 1;
155
- @size_read ||=0
139
+ while (text = get_newline)
140
+ @count ||= 0
141
+ @count += 1
142
+ @size_read ||= 0
156
143
  @size_read += text.bytesize
157
144
  @total_size += text.bytesize
158
145
  output_text << text
159
146
  end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
160
147
  # never close the file until the end of the page even if end_flag is on
161
- if end_flag && /<\/page/ =~ text
162
- @fp.puts(output_text)
163
- output_text = ""
164
- @total_size = 0
165
- end_flag = false
166
- @fp.close
167
- @file_index += 1
168
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
169
- @outfiles << outfilename
170
- @fp = File.open(outfilename, "w")
171
- next
172
- end
148
+ next unless end_flag && %r{</page} =~ text
149
+
150
+ @fp.puts(output_text)
151
+ output_text = +""
152
+ @total_size = 0
153
+ end_flag = false
154
+ @fp.close
155
+ @file_index += 1
156
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
157
+ @outfiles << outfilename
158
+ @fp = File.open(outfilename, "w")
173
159
  end
174
160
  @fp.puts(output_text) if output_text != ""
175
161
  @fp.close
176
162
 
177
- if File.size(outfilename) == 0
163
+ if File.size(outfilename).zero?
178
164
  File.delete(outfilename)
179
165
  @outfiles.delete(outfilename)
180
166
  end
@@ -201,14 +187,14 @@ module Wp2txt
201
187
  @file_pointer = file
202
188
  @outfile_base = File.basename(@input_file, ".*")
203
189
  @total_size = 0
204
- return true
190
+ true
205
191
  end
206
192
 
207
193
  def fill_buffer
208
- while true do
194
+ loop do
209
195
  begin
210
- new_lines = @file_pointer.read(10485760)
211
- rescue => e
196
+ new_lines = @file_pointer.read(10_485_760)
197
+ rescue StandardError
212
198
  return nil
213
199
  end
214
200
  return nil unless new_lines
@@ -216,49 +202,40 @@ module Wp2txt
216
202
  # temp_buf is filled with text split by "\n"
217
203
  temp_buf = []
218
204
  ss = StringScanner.new(new_lines)
219
- while ss.scan(/.*?\n/m)
220
- temp_buf << ss[0]
221
- end
205
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
222
206
  temp_buf << ss.rest unless ss.eos?
223
207
 
224
208
  new_first_line = temp_buf.shift
225
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
226
- @buffer.last << new_first_line
227
- @buffer << ""
228
- else
229
- @buffer.last << new_first_line
230
- end
209
+ @buffer.last << new_first_line
210
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
231
211
  @buffer += temp_buf unless temp_buf.empty?
232
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
233
- @buffer << ""
234
- end
212
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
235
213
  break if @buffer.size > 1
236
214
  end
237
- return true
215
+ true
238
216
  end
239
217
 
240
218
  def get_newline
241
- @buffer ||= [""]
242
- if @buffer.size == 1
243
- return nil unless fill_buffer
244
- end
245
- if @buffer.empty?
246
- return nil
219
+ @buffer ||= [+""]
220
+ if @buffer.size == 1 && !fill_buffer
221
+ nil
222
+ elsif @buffer.empty?
223
+ nil
247
224
  else
248
- new_line = @buffer.shift
249
- return new_line
225
+ @buffer.shift
250
226
  end
251
227
  end
252
228
 
253
229
  def get_page
254
230
  inside_page = false
255
- page = ""
256
- while line = get_newline
257
- if /<page>/ =~ line #
231
+ page = +""
232
+ while (line = get_newline)
233
+ case line
234
+ when /<page>/
258
235
  page << line
259
236
  inside_page = true
260
237
  next
261
- elsif /<\/page>/ =~ line #
238
+ when %r{</page>}
262
239
  page << line
263
240
  inside_page = false
264
241
  break
@@ -266,65 +243,62 @@ module Wp2txt
266
243
  page << line if inside_page
267
244
  end
268
245
  if page.empty?
269
- return false
246
+ false
270
247
  else
271
- return page.force_encoding("utf-8") rescue page
248
+ page.force_encoding("utf-8")
272
249
  end
250
+ rescue StandardError
251
+ page
273
252
  end
274
253
 
275
254
  def extract_text(&block)
276
- in_text = false
277
- in_message = false
278
- result_text = ""
279
255
  title = nil
280
- end_flag = false
281
- terminal_round = false
282
- output_text = ""
256
+ output_text = +""
283
257
  pages = []
284
258
  data_empty = false
285
259
 
286
- while !data_empty
287
- page = get_page
288
- if page
289
- pages << page
260
+ until data_empty
261
+ new_page = get_page
262
+ if new_page
263
+ pages << new_page
290
264
  else
291
265
  data_empty = true
292
266
  end
293
- if data_empty
294
- pages.each do |page|
295
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
296
- xml = xmlns + page + "</mediawiki>"
297
-
298
- input = Nokogiri::XML(xml, nil, 'UTF-8')
299
- page = input.xpath("//xmlns:text").first
300
- pp_title = page.parent.parent.at_css "title"
301
- title = pp_title.content
302
- unless /\:/ =~ title
303
- text = page.content
304
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
305
- num_of_newlines = content.count("\n")
306
- if num_of_newlines == 0
307
- ""
308
- else
309
- "\n" * num_of_newlines
310
- end
311
- end
312
- article = Article.new(text, title, @strip_tmarker)
313
- page_text = block.call(article)
314
- output_text << page_text
267
+ next unless data_empty
268
+
269
+ pages.each do |page|
270
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
271
+ xml = xmlns + page + "</mediawiki>"
272
+
273
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
274
+ page = input.xpath("//xmlns:text").first
275
+ pp_title = page.parent.parent.at_css "title"
276
+ title = pp_title.content
277
+ next if /:/ =~ title
278
+
279
+ text = page.content
280
+ text.gsub!(/<!--(.*?)-->/m) do |content|
281
+ num_of_newlines = content.count("\n")
282
+ if num_of_newlines.zero?
283
+ +""
284
+ else
285
+ "\n" * num_of_newlines
315
286
  end
316
287
  end
288
+ article = Article.new(text, title, @strip_tmarker)
289
+ page_text = block.call(article)
290
+ output_text << page_text
291
+ end
317
292
 
318
- cleanup!(output_text)
319
- if output_text.size > 0
320
- outfilename = File.join(@output_dir, @outfile_base + ".txt")
321
- @fp = File.open(outfilename, "w")
322
- @fp.puts(output_text)
323
- @fp.close
324
- end
325
- File.delete(@input_file) if @del_interfile
326
- output_text = ""
293
+ output_text = cleanup(output_text)
294
+ unless output_text.empty?
295
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
296
+ @fp = File.open(outfilename, "w")
297
+ @fp.puts(output_text)
298
+ @fp.close
327
299
  end
300
+ File.delete(@input_file) if @del_interfile
301
+ output_text = +""
328
302
  end
329
303
  end
330
304
  end
data/spec/spec_helper.rb CHANGED
@@ -1,6 +1,6 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require 'rspec'
1
+ # frozen_string_literal: true
2
+
3
+ require "rspec"
3
4
 
4
5
  RSpec.configure do |config|
5
- # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
6
- end
6
+ end