wp2txt 0.7.8 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
4
- data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
3
+ metadata.gz: d0610b7e28e04c4cd9c3a1401c88e15f6ddb16ec
4
+ data.tar.gz: b866915631fdc956395c005735b089ddff7956e5
5
5
  SHA512:
6
- metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
7
- data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
6
+ metadata.gz: e9fbef3de5ed866de0b3c7fadd96bdf0ff501b71c2d9f6f282eed538194bdfff8d9659cf53aedf95062c9aadf2ec90393075158ef9c8ae78f3e53ce84119f764
7
+ data.tar.gz: 36ad316986d94a6be89ccb591dec510dc4695bede188448fde0745702faad8039d0df4a84d2f0730dd11749a63d58e6b63b51b2bce585d8f8ccb3ff02553c3c8
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Wikipedia dump file to text converter
4
4
 
5
- **Important** This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution
5
+ **Important: This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution!**
6
6
 
7
7
  ### About ###
8
8
 
data/bin/wp2txt CHANGED
@@ -4,7 +4,7 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
  $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
6
 
7
- $DEBUG_MODE = false
7
+ DEBUG_MODE = true
8
8
  SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
9
9
  DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
10
10
 
@@ -76,16 +76,18 @@ wpconv.extract_text do |article|
76
76
  when :mw_heading
77
77
  next if !config[:heading]
78
78
  format_wiki!(e.last)
79
+ format_article!(e.last)
79
80
  line = e.last
80
81
  line << "+HEADING+" if $DEBUG_MODE
81
82
  when :mw_paragraph
82
83
  # next if !config[:paragraph]
83
84
  format_wiki!(e.last)
84
- line = e.last
85
+ format_article!(e.last)
86
+ line = e.last + "\n"
85
87
  line << "+PARAGRAPH+" if $DEBUG_MODE
86
88
  when :mw_table, :mw_htable
87
89
  next if !config[:table]
88
- format_wiki!(e.last)
90
+ # format_wiki!(e.last)
89
91
  line = e.last
90
92
  line << "+TABLE+" if $DEBUG_MODE
91
93
  when :mw_pre
@@ -94,23 +96,23 @@ wpconv.extract_text do |article|
94
96
  line << "+PRE+" if $DEBUG_MODE
95
97
  when :mw_quote
96
98
  # next if !config[:quote]
97
- format_wiki!(e.last)
99
+ # format_wiki!(e.last)
98
100
  line = e.last
99
101
  line << "+QUOTE+" if $DEBUG_MODE
100
102
  when :mw_unordered, :mw_ordered, :mw_definition
101
103
  next if !config[:list]
102
- format_wiki!(e.last)
104
+ # format_wiki!(e.last)
103
105
  line = e.last
104
106
  line << "+LIST+" if $DEBUG_MODE
105
107
  when :mw_redirect
106
108
  next if !config[:redirect]
107
- format_wiki!(e.last)
109
+ # format_wiki!(e.last)
108
110
  line = e.last
109
111
  line << "+REDIRECT+" if $DEBUG_MODE
110
112
  line << "\n\n"
111
113
  else
112
114
  if $DEBUG_MODE
113
- format_wiki!(e.last)
115
+ # format_wiki!(e.last)
114
116
  line = e.last
115
117
  line << "+OTHER+"
116
118
  else
@@ -119,8 +121,9 @@ wpconv.extract_text do |article|
119
121
  end
120
122
  contents << line
121
123
  end
122
- format_article!(contents)
123
124
  convert_characters!(contents)
125
+ remove_table!(contents) unless $leave_table
126
+ remove_ref!(contents) unless $leave_ref
124
127
 
125
128
  ##### cleanup #####
126
129
  if /\A\s*\z/m =~ contents
@@ -129,7 +132,6 @@ wpconv.extract_text do |article|
129
132
  result = config[:title] ? title + "\n" << contents : contents
130
133
  end
131
134
  result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
132
- result.gsub!(/^[\s\W]+$/)
133
135
  result.gsub!(/\n\n\n+/m){"\n\n"}
134
136
  result << "\n"
135
137
  end
@@ -3,12 +3,7 @@
3
3
 
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
- # require "rubygems"
7
- # require "bundler/setup"
8
-
9
6
  require "nokogiri"
10
- # require "oga"
11
- # require "ox"
12
7
 
13
8
  require 'pp'
14
9
  require "wp2txt/article"
@@ -249,36 +244,6 @@ module Wp2txt
249
244
  next if /\:/ =~ title
250
245
  text = page.content
251
246
 
252
- # input = Oga.parse_xml(xml)
253
- # page = input.xpath("//xmlns:text").first
254
- # title = page.parent.parent.xpath("//xmlns:title").first.text
255
- # next if /\:/ =~ title
256
- # text = page.text
257
-
258
- # input = Ox.load(xml, :encoding => "UTF-8")
259
- # title = ""
260
- # text = ""
261
- # input.nodes.first.nodes.each do |n|
262
- # if n.name == "title"
263
- # title = n.nodes.first
264
- # if /\:/ =~ title
265
- # title = ""
266
- # break
267
- # end
268
- # elsif n.name == "revision"
269
- # n.nodes.each do |o|
270
- # if o.name == "text"
271
- # text = o.nodes.first
272
- # break
273
- # end
274
- # end
275
- # end
276
- # end
277
- # next if title == "" || text == ""
278
-
279
- # remove all comment texts
280
- # and insert as many number of new line chars included in
281
- # each comment instead
282
247
  text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
283
248
  num_of_newlines = content.count("\n")
284
249
  if num_of_newlines == 0
@@ -299,6 +264,7 @@ module Wp2txt
299
264
 
300
265
  #close the present file, then open a new one
301
266
  if end_flag
267
+ output_text.strip!
302
268
  @fp.puts(output_text)
303
269
  output_text = ""
304
270
  @total_size = 0
@@ -311,7 +277,10 @@ module Wp2txt
311
277
  next
312
278
  end
313
279
  end
314
- @fp.puts(output_text) if output_text != ""
280
+ if output_text != ""
281
+ output_text.strip!
282
+ @fp.puts(output_text)
283
+ end
315
284
  notify_parent(true)
316
285
  @parent.after
317
286
  @fp.close
@@ -103,7 +103,7 @@ module Wp2txt
103
103
  when $in_inputbox_regex
104
104
  @elements << create_element(:mw_inputbox, line)
105
105
  when $in_inputbox_regex1
106
- mode = :mw_inputbox
106
+ mode = :mw_inputbox
107
107
  @elements << create_element(:mw_inputbox, line)
108
108
  when $in_source_regex
109
109
  @elements << create_element(:mw_source, line)
@@ -129,7 +129,6 @@ module Wp2txt
129
129
  convert_characters!(text)
130
130
  correct_inline_template!(text) unless $leave_template
131
131
  remove_templates!(text) unless $leave_template
132
- remove_table!(text) unless $leave_table
133
132
  end
134
133
 
135
134
  #################### parser for nested structure ####################
@@ -295,16 +294,19 @@ module Wp2txt
295
294
  str.gsub!($mndash_regex, "–")
296
295
  end
297
296
 
298
- def remove_hr!(page)
299
- page.gsub!($remove_hr_regex, "")
297
+ def remove_hr!(str)
298
+ str.gsub!($remove_hr_regex, "")
300
299
  end
301
300
 
301
+ def remove_ref!(str)
302
+ str.gsub!($format_ref_regex){""}
303
+ end
304
+
302
305
  def make_reference!(str)
303
306
  str.gsub!($make_reference_regex_a){"\n"}
304
307
  str.gsub!($make_reference_regex_b){""}
305
308
  str.gsub!($make_reference_regex_c){"[ref]"}
306
309
  str.gsub!($make_reference_regex_d){"[/ref]"}
307
- str.gsub!($format_ref_regex){""} unless $leave_ref
308
310
  end
309
311
 
310
312
  def format_ref!(page)
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.8"
2
+ VERSION = "0.8.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.8
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
@@ -70,7 +70,6 @@ files:
70
70
  - bin/benchmark.rb
71
71
  - bin/wp2txt
72
72
  - data/testdata.bz2
73
- - error_log.txt
74
73
  - lib/wp2txt.rb
75
74
  - lib/wp2txt/article.rb
76
75
  - lib/wp2txt/mw_api.rb
@@ -1 +0,0 @@
1
- [[アンパサンド]]