wp2txt 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt.rb CHANGED
@@ -1,11 +1,8 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
1
+ # frozen_string_literal: true
5
2
 
6
3
  require "nokogiri"
7
- require "wp2txt/article"
8
- require "wp2txt/utils"
4
+ require_relative "wp2txt/article"
5
+ require_relative "wp2txt/utils"
9
6
 
10
7
  module Wp2txt
11
8
  class Splitter
@@ -15,51 +12,51 @@ module Wp2txt
15
12
  @input_file = input_file
16
13
  @output_dir = output_dir
17
14
  @tfile_size = tfile_size
18
- if bz2_gem
19
- require "bzip2-ruby"
20
- end
15
+ require "bzip2-ruby" if bz2_gem
21
16
  @bz2_gem = bz2_gem
22
17
  prepare
23
18
  end
24
19
 
25
20
  def file_size(file)
26
- origin = Time.now
27
- size = 0; unit = 10485760; star = 0; before = Time.now.to_f
28
- error_count = 10
29
- while true do
21
+ size = 0
22
+ unit = 10_485_760
23
+ star = 0
24
+ before = Time.now.to_f
25
+
26
+ loop do
30
27
  begin
31
28
  a = file.read(unit)
32
- rescue => e
29
+ rescue StandardError
33
30
  a = nil
34
31
  end
35
32
  break unless a
36
33
 
37
34
  present = Time.now.to_f
38
35
  size += a.size
39
- if present - before > 0.3
40
- star = 0 if star > 10
41
- star += 1
42
- before = present
43
- end
36
+
37
+ next if present - before <= 0.3
38
+
39
+ star = 0 if star > 10
40
+ star += 1
41
+ before = present
44
42
  end
45
- time_elapsed = Time.now - origin
46
43
  size
47
44
  end
48
45
 
49
46
  # check if a given command exists: return the path if it does, return false if not
50
47
  def command_exist?(command)
51
48
  basename = File.basename(command)
52
- path = ""
49
+ path = +""
53
50
  print "Checking #{basename}: "
54
- if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
51
+ if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
55
52
  puts "detected [#{path}]"
56
- return path.strip
57
- elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
53
+ path.strip
54
+ elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
58
55
  puts "detected [#{path}]"
59
- return path.strip
56
+ path.strip
60
57
  else
61
58
  puts "not found"
62
- return false
59
+ false
63
60
  end
64
61
  end
65
62
 
@@ -67,28 +64,22 @@ module Wp2txt
67
64
  def prepare
68
65
  # if output_dir is not specified, output in the same directory
69
66
  # as the input file
70
- if !@output_dir && @input_file
71
- @output_dir = File.dirname(@input_file)
72
- end
67
+ @output_dir = File.dirname(@input_file) if !@output_dir && @input_file
73
68
 
74
69
  if /.bz2$/ =~ @input_file
75
70
  if @bz2_gem
76
71
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
72
  elsif RUBY_PLATFORM.index("win32")
78
73
  file = IO.popen("bunzip2.exe -c #{@input_file}")
79
- else
80
- if bzpath = command_exist?("lbzip2") ||
81
- command_exist?("pbzip2") ||
82
- command_exist?("bzip2")
83
- file = IO.popen("#{bzpath} -c -d #{@input_file}")
84
- end
74
+ elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
75
+ file = IO.popen("#{bzpath} -c -d #{@input_file}")
85
76
  end
86
77
  else # meaning that it is a text file
87
78
  @infile_size = File.stat(@input_file).size
88
79
  file = open(@input_file)
89
80
  end
90
81
 
91
- #create basename of output file
82
+ # create basename of output file
92
83
  @outfile_base = File.basename(@input_file, ".*") + "-"
93
84
  @total_size = 0
94
85
  @file_index = 1
@@ -97,15 +88,15 @@ module Wp2txt
97
88
  @outfiles << outfilename
98
89
  @fp = File.open(outfilename, "w")
99
90
  @file_pointer = file
100
- return true
91
+ true
101
92
  end
102
93
 
103
94
  # read text data from bz2 compressed file by 1 megabyte
104
95
  def fill_buffer
105
- while true do
96
+ loop do
106
97
  begin
107
- new_lines = @file_pointer.read(10485760)
108
- rescue => e
98
+ new_lines = @file_pointer.read(10_485_760)
99
+ rescue StandardError
109
100
  return nil
110
101
  end
111
102
  return nil unless new_lines
@@ -113,68 +104,58 @@ module Wp2txt
113
104
  # temp_buf is filled with text split by "\n"
114
105
  temp_buf = []
115
106
  ss = StringScanner.new(new_lines)
116
- while ss.scan(/.*?\n/m)
117
- temp_buf << ss[0]
118
- end
107
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
119
108
  temp_buf << ss.rest unless ss.eos?
120
109
 
121
110
  new_first_line = temp_buf.shift
122
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
123
- @buffer.last << new_first_line
124
- @buffer << ""
125
- else
126
- @buffer.last << new_first_line
127
- end
111
+ @buffer.last << new_first_line
112
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
128
113
  @buffer += temp_buf unless temp_buf.empty?
129
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
130
- @buffer << ""
131
- end
114
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
132
115
  break if @buffer.size > 1
133
116
  end
134
- return true
117
+ true
135
118
  end
136
119
 
137
120
  def get_newline
138
- @buffer ||= [""]
139
- if @buffer.size == 1
140
- return nil unless fill_buffer
141
- end
142
- if @buffer.empty?
143
- return nil
121
+ @buffer ||= [+""]
122
+ if @buffer.size == 1 && !fill_buffer
123
+ nil
124
+ elsif @buffer.empty?
125
+ nil
144
126
  else
145
- new_line = @buffer.shift
146
- return new_line
127
+ @buffer.shift
147
128
  end
148
129
  end
149
130
 
150
131
  def split_file
151
- output_text = ""
132
+ output_text = +""
152
133
  end_flag = false
153
- while text = get_newline
154
- @count ||= 0;@count += 1;
155
- @size_read ||=0
134
+ while (text = get_newline)
135
+ @count ||= 0
136
+ @count += 1
137
+ @size_read ||= 0
156
138
  @size_read += text.bytesize
157
139
  @total_size += text.bytesize
158
140
  output_text << text
159
141
  end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
160
142
  # never close the file until the end of the page even if end_flag is on
161
- if end_flag && /<\/page/ =~ text
162
- @fp.puts(output_text)
163
- output_text = ""
164
- @total_size = 0
165
- end_flag = false
166
- @fp.close
167
- @file_index += 1
168
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
169
- @outfiles << outfilename
170
- @fp = File.open(outfilename, "w")
171
- next
172
- end
143
+ next unless end_flag && %r{</page} =~ text
144
+
145
+ @fp.puts(output_text)
146
+ output_text = +""
147
+ @total_size = 0
148
+ end_flag = false
149
+ @fp.close
150
+ @file_index += 1
151
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
152
+ @outfiles << outfilename
153
+ @fp = File.open(outfilename, "w")
173
154
  end
174
155
  @fp.puts(output_text) if output_text != ""
175
156
  @fp.close
176
157
 
177
- if File.size(outfilename) == 0
158
+ if File.size(outfilename).zero?
178
159
  File.delete(outfilename)
179
160
  @outfiles.delete(outfilename)
180
161
  end
@@ -201,14 +182,14 @@ module Wp2txt
201
182
  @file_pointer = file
202
183
  @outfile_base = File.basename(@input_file, ".*")
203
184
  @total_size = 0
204
- return true
185
+ true
205
186
  end
206
187
 
207
188
  def fill_buffer
208
- while true do
189
+ loop do
209
190
  begin
210
- new_lines = @file_pointer.read(10485760)
211
- rescue => e
191
+ new_lines = @file_pointer.read(10_485_760)
192
+ rescue StandardError
212
193
  return nil
213
194
  end
214
195
  return nil unless new_lines
@@ -216,49 +197,40 @@ module Wp2txt
216
197
  # temp_buf is filled with text split by "\n"
217
198
  temp_buf = []
218
199
  ss = StringScanner.new(new_lines)
219
- while ss.scan(/.*?\n/m)
220
- temp_buf << ss[0]
221
- end
200
+ temp_buf << ss[0] while ss.scan(/.*?\n/m)
222
201
  temp_buf << ss.rest unless ss.eos?
223
202
 
224
203
  new_first_line = temp_buf.shift
225
- if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
226
- @buffer.last << new_first_line
227
- @buffer << ""
228
- else
229
- @buffer.last << new_first_line
230
- end
204
+ @buffer.last << new_first_line
205
+ @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
231
206
  @buffer += temp_buf unless temp_buf.empty?
232
- if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
233
- @buffer << ""
234
- end
207
+ @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
235
208
  break if @buffer.size > 1
236
209
  end
237
- return true
210
+ true
238
211
  end
239
212
 
240
213
  def get_newline
241
- @buffer ||= [""]
242
- if @buffer.size == 1
243
- return nil unless fill_buffer
244
- end
245
- if @buffer.empty?
246
- return nil
214
+ @buffer ||= [+""]
215
+ if @buffer.size == 1 && !fill_buffer
216
+ nil
217
+ elsif @buffer.empty?
218
+ nil
247
219
  else
248
- new_line = @buffer.shift
249
- return new_line
220
+ @buffer.shift
250
221
  end
251
222
  end
252
223
 
253
224
  def get_page
254
225
  inside_page = false
255
- page = ""
256
- while line = get_newline
257
- if /<page>/ =~ line #
226
+ page = +""
227
+ while (line = get_newline)
228
+ case line
229
+ when /<page>/
258
230
  page << line
259
231
  inside_page = true
260
232
  next
261
- elsif /<\/page>/ =~ line #
233
+ when %r{</page>}
262
234
  page << line
263
235
  inside_page = false
264
236
  break
@@ -266,65 +238,62 @@ module Wp2txt
266
238
  page << line if inside_page
267
239
  end
268
240
  if page.empty?
269
- return false
241
+ false
270
242
  else
271
- return page.force_encoding("utf-8") rescue page
243
+ page.force_encoding("utf-8")
272
244
  end
245
+ rescue StandardError
246
+ page
273
247
  end
274
248
 
275
249
  def extract_text(&block)
276
- in_text = false
277
- in_message = false
278
- result_text = ""
279
250
  title = nil
280
- end_flag = false
281
- terminal_round = false
282
- output_text = ""
251
+ output_text = +""
283
252
  pages = []
284
253
  data_empty = false
285
254
 
286
- while !data_empty
287
- page = get_page
288
- if page
289
- pages << page
255
+ until data_empty
256
+ new_page = get_page
257
+ if new_page
258
+ pages << new_page
290
259
  else
291
260
  data_empty = true
292
261
  end
293
- if data_empty
294
- pages.each do |page|
295
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
296
- xml = xmlns + page + "</mediawiki>"
297
-
298
- input = Nokogiri::XML(xml, nil, 'UTF-8')
299
- page = input.xpath("//xmlns:text").first
300
- pp_title = page.parent.parent.at_css "title"
301
- title = pp_title.content
302
- unless /\:/ =~ title
303
- text = page.content
304
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
305
- num_of_newlines = content.count("\n")
306
- if num_of_newlines == 0
307
- ""
308
- else
309
- "\n" * num_of_newlines
310
- end
311
- end
312
- article = Article.new(text, title, @strip_tmarker)
313
- page_text = block.call(article)
314
- output_text << page_text
262
+ next unless data_empty
263
+
264
+ pages.each do |page|
265
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
266
+ xml = xmlns + page + "</mediawiki>"
267
+
268
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
269
+ page = input.xpath("//xmlns:text").first
270
+ pp_title = page.parent.parent.at_css "title"
271
+ title = pp_title.content
272
+ next if /:/ =~ title
273
+
274
+ text = page.content
275
+ text.gsub!(/<!--(.*?)-->/m) do |content|
276
+ num_of_newlines = content.count("\n")
277
+ if num_of_newlines.zero?
278
+ +""
279
+ else
280
+ "\n" * num_of_newlines
315
281
  end
316
282
  end
283
+ article = Article.new(text, title, @strip_tmarker)
284
+ page_text = block.call(article)
285
+ output_text << page_text
286
+ end
317
287
 
318
- cleanup!(output_text)
319
- if output_text.size > 0
320
- outfilename = File.join(@output_dir, @outfile_base + ".txt")
321
- @fp = File.open(outfilename, "w")
322
- @fp.puts(output_text)
323
- @fp.close
324
- end
325
- File.delete(@input_file) if @del_interfile
326
- output_text = ""
288
+ output_text = cleanup(output_text)
289
+ unless output_text.empty?
290
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
291
+ @fp = File.open(outfilename, "w")
292
+ @fp.puts(output_text)
293
+ @fp.close
327
294
  end
295
+ File.delete(@input_file) if @del_interfile
296
+ output_text = +""
328
297
  end
329
298
  end
330
299
  end
data/spec/spec_helper.rb CHANGED
@@ -1,6 +1,6 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require 'rspec'
1
+ # frozen_string_literal: true
2
+
3
+ require "rspec"
3
4
 
4
5
  RSpec.configure do |config|
5
- # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more infomation
6
- end
6
+ end