wp2txt 0.7.8 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
4
- data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
3
+ metadata.gz: d0610b7e28e04c4cd9c3a1401c88e15f6ddb16ec
4
+ data.tar.gz: b866915631fdc956395c005735b089ddff7956e5
5
5
  SHA512:
6
- metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
7
- data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
6
+ metadata.gz: e9fbef3de5ed866de0b3c7fadd96bdf0ff501b71c2d9f6f282eed538194bdfff8d9659cf53aedf95062c9aadf2ec90393075158ef9c8ae78f3e53ce84119f764
7
+ data.tar.gz: 36ad316986d94a6be89ccb591dec510dc4695bede188448fde0745702faad8039d0df4a84d2f0730dd11749a63d58e6b63b51b2bce585d8f8ccb3ff02553c3c8
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Wikipedia dump file to text converter
4
4
 
5
- **Important** This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution
5
+ **Important: This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution!**
6
6
 
7
7
  ### About ###
8
8
 
data/bin/wp2txt CHANGED
@@ -4,7 +4,7 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
  $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
6
 
7
- $DEBUG_MODE = false
7
+ DEBUG_MODE = true
8
8
  SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
9
9
  DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
10
10
 
@@ -76,16 +76,18 @@ wpconv.extract_text do |article|
76
76
  when :mw_heading
77
77
  next if !config[:heading]
78
78
  format_wiki!(e.last)
79
+ format_article!(e.last)
79
80
  line = e.last
80
81
  line << "+HEADING+" if $DEBUG_MODE
81
82
  when :mw_paragraph
82
83
  # next if !config[:paragraph]
83
84
  format_wiki!(e.last)
84
- line = e.last
85
+ format_article!(e.last)
86
+ line = e.last + "\n"
85
87
  line << "+PARAGRAPH+" if $DEBUG_MODE
86
88
  when :mw_table, :mw_htable
87
89
  next if !config[:table]
88
- format_wiki!(e.last)
90
+ # format_wiki!(e.last)
89
91
  line = e.last
90
92
  line << "+TABLE+" if $DEBUG_MODE
91
93
  when :mw_pre
@@ -94,23 +96,23 @@ wpconv.extract_text do |article|
94
96
  line << "+PRE+" if $DEBUG_MODE
95
97
  when :mw_quote
96
98
  # next if !config[:quote]
97
- format_wiki!(e.last)
99
+ # format_wiki!(e.last)
98
100
  line = e.last
99
101
  line << "+QUOTE+" if $DEBUG_MODE
100
102
  when :mw_unordered, :mw_ordered, :mw_definition
101
103
  next if !config[:list]
102
- format_wiki!(e.last)
104
+ # format_wiki!(e.last)
103
105
  line = e.last
104
106
  line << "+LIST+" if $DEBUG_MODE
105
107
  when :mw_redirect
106
108
  next if !config[:redirect]
107
- format_wiki!(e.last)
109
+ # format_wiki!(e.last)
108
110
  line = e.last
109
111
  line << "+REDIRECT+" if $DEBUG_MODE
110
112
  line << "\n\n"
111
113
  else
112
114
  if $DEBUG_MODE
113
- format_wiki!(e.last)
115
+ # format_wiki!(e.last)
114
116
  line = e.last
115
117
  line << "+OTHER+"
116
118
  else
@@ -119,8 +121,9 @@ wpconv.extract_text do |article|
119
121
  end
120
122
  contents << line
121
123
  end
122
- format_article!(contents)
123
124
  convert_characters!(contents)
125
+ remove_table!(contents) unless $leave_table
126
+ remove_ref!(contents) unless $leave_ref
124
127
 
125
128
  ##### cleanup #####
126
129
  if /\A\s*\z/m =~ contents
@@ -129,7 +132,6 @@ wpconv.extract_text do |article|
129
132
  result = config[:title] ? title + "\n" << contents : contents
130
133
  end
131
134
  result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
132
- result.gsub!(/^[\s\W]+$/)
133
135
  result.gsub!(/\n\n\n+/m){"\n\n"}
134
136
  result << "\n"
135
137
  end
@@ -3,12 +3,7 @@
3
3
 
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
- # require "rubygems"
7
- # require "bundler/setup"
8
-
9
6
  require "nokogiri"
10
- # require "oga"
11
- # require "ox"
12
7
 
13
8
  require 'pp'
14
9
  require "wp2txt/article"
@@ -249,36 +244,6 @@ module Wp2txt
249
244
  next if /\:/ =~ title
250
245
  text = page.content
251
246
 
252
- # input = Oga.parse_xml(xml)
253
- # page = input.xpath("//xmlns:text").first
254
- # title = page.parent.parent.xpath("//xmlns:title").first.text
255
- # next if /\:/ =~ title
256
- # text = page.text
257
-
258
- # input = Ox.load(xml, :encoding => "UTF-8")
259
- # title = ""
260
- # text = ""
261
- # input.nodes.first.nodes.each do |n|
262
- # if n.name == "title"
263
- # title = n.nodes.first
264
- # if /\:/ =~ title
265
- # title = ""
266
- # break
267
- # end
268
- # elsif n.name == "revision"
269
- # n.nodes.each do |o|
270
- # if o.name == "text"
271
- # text = o.nodes.first
272
- # break
273
- # end
274
- # end
275
- # end
276
- # end
277
- # next if title == "" || text == ""
278
-
279
- # remove all comment texts
280
- # and insert as many number of new line chars included in
281
- # each comment instead
282
247
  text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
283
248
  num_of_newlines = content.count("\n")
284
249
  if num_of_newlines == 0
@@ -299,6 +264,7 @@ module Wp2txt
299
264
 
300
265
  #close the present file, then open a new one
301
266
  if end_flag
267
+ output_text.strip!
302
268
  @fp.puts(output_text)
303
269
  output_text = ""
304
270
  @total_size = 0
@@ -311,7 +277,10 @@ module Wp2txt
311
277
  next
312
278
  end
313
279
  end
314
- @fp.puts(output_text) if output_text != ""
280
+ if output_text != ""
281
+ output_text.strip!
282
+ @fp.puts(output_text)
283
+ end
315
284
  notify_parent(true)
316
285
  @parent.after
317
286
  @fp.close
@@ -103,7 +103,7 @@ module Wp2txt
103
103
  when $in_inputbox_regex
104
104
  @elements << create_element(:mw_inputbox, line)
105
105
  when $in_inputbox_regex1
106
- mode = :mw_inputbox
106
+ mode = :mw_inputbox
107
107
  @elements << create_element(:mw_inputbox, line)
108
108
  when $in_source_regex
109
109
  @elements << create_element(:mw_source, line)
@@ -129,7 +129,6 @@ module Wp2txt
129
129
  convert_characters!(text)
130
130
  correct_inline_template!(text) unless $leave_template
131
131
  remove_templates!(text) unless $leave_template
132
- remove_table!(text) unless $leave_table
133
132
  end
134
133
 
135
134
  #################### parser for nested structure ####################
@@ -295,16 +294,19 @@ module Wp2txt
295
294
  str.gsub!($mndash_regex, "–")
296
295
  end
297
296
 
298
- def remove_hr!(page)
299
- page.gsub!($remove_hr_regex, "")
297
+ def remove_hr!(str)
298
+ str.gsub!($remove_hr_regex, "")
300
299
  end
301
300
 
301
+ def remove_ref!(str)
302
+ str.gsub!($format_ref_regex){""}
303
+ end
304
+
302
305
  def make_reference!(str)
303
306
  str.gsub!($make_reference_regex_a){"\n"}
304
307
  str.gsub!($make_reference_regex_b){""}
305
308
  str.gsub!($make_reference_regex_c){"[ref]"}
306
309
  str.gsub!($make_reference_regex_d){"[/ref]"}
307
- str.gsub!($format_ref_regex){""} unless $leave_ref
308
310
  end
309
311
 
310
312
  def format_ref!(page)
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.8"
2
+ VERSION = "0.8.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.8
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
@@ -70,7 +70,6 @@ files:
70
70
  - bin/benchmark.rb
71
71
  - bin/wp2txt
72
72
  - data/testdata.bz2
73
- - error_log.txt
74
73
  - lib/wp2txt.rb
75
74
  - lib/wp2txt/article.rb
76
75
  - lib/wp2txt/mw_api.rb
@@ -1 +0,0 @@
1
- [[アンパサンド]]