wp2txt 0.7.8 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/wp2txt +11 -9
- data/lib/wp2txt.rb +5 -36
- data/lib/wp2txt/article.rb +1 -1
- data/lib/wp2txt/utils.rb +6 -4
- data/lib/wp2txt/version.rb +1 -1
- metadata +1 -2
- data/error_log.txt +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0610b7e28e04c4cd9c3a1401c88e15f6ddb16ec
|
4
|
+
data.tar.gz: b866915631fdc956395c005735b089ddff7956e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e9fbef3de5ed866de0b3c7fadd96bdf0ff501b71c2d9f6f282eed538194bdfff8d9659cf53aedf95062c9aadf2ec90393075158ef9c8ae78f3e53ce84119f764
|
7
|
+
data.tar.gz: 36ad316986d94a6be89ccb591dec510dc4695bede188448fde0745702faad8039d0df4a84d2f0730dd11749a63d58e6b63b51b2bce585d8f8ccb3ff02553c3c8
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
-
**Important
|
5
|
+
**Important: This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution!**
|
6
6
|
|
7
7
|
### About ###
|
8
8
|
|
data/bin/wp2txt
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
6
|
|
7
|
-
|
7
|
+
DEBUG_MODE = true
|
8
8
|
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
|
9
9
|
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
10
10
|
|
@@ -76,16 +76,18 @@ wpconv.extract_text do |article|
|
|
76
76
|
when :mw_heading
|
77
77
|
next if !config[:heading]
|
78
78
|
format_wiki!(e.last)
|
79
|
+
format_article!(e.last)
|
79
80
|
line = e.last
|
80
81
|
line << "+HEADING+" if $DEBUG_MODE
|
81
82
|
when :mw_paragraph
|
82
83
|
# next if !config[:paragraph]
|
83
84
|
format_wiki!(e.last)
|
84
|
-
|
85
|
+
format_article!(e.last)
|
86
|
+
line = e.last + "\n"
|
85
87
|
line << "+PARAGRAPH+" if $DEBUG_MODE
|
86
88
|
when :mw_table, :mw_htable
|
87
89
|
next if !config[:table]
|
88
|
-
format_wiki!(e.last)
|
90
|
+
# format_wiki!(e.last)
|
89
91
|
line = e.last
|
90
92
|
line << "+TABLE+" if $DEBUG_MODE
|
91
93
|
when :mw_pre
|
@@ -94,23 +96,23 @@ wpconv.extract_text do |article|
|
|
94
96
|
line << "+PRE+" if $DEBUG_MODE
|
95
97
|
when :mw_quote
|
96
98
|
# next if !config[:quote]
|
97
|
-
format_wiki!(e.last)
|
99
|
+
# format_wiki!(e.last)
|
98
100
|
line = e.last
|
99
101
|
line << "+QUOTE+" if $DEBUG_MODE
|
100
102
|
when :mw_unordered, :mw_ordered, :mw_definition
|
101
103
|
next if !config[:list]
|
102
|
-
format_wiki!(e.last)
|
104
|
+
# format_wiki!(e.last)
|
103
105
|
line = e.last
|
104
106
|
line << "+LIST+" if $DEBUG_MODE
|
105
107
|
when :mw_redirect
|
106
108
|
next if !config[:redirect]
|
107
|
-
format_wiki!(e.last)
|
109
|
+
# format_wiki!(e.last)
|
108
110
|
line = e.last
|
109
111
|
line << "+REDIRECT+" if $DEBUG_MODE
|
110
112
|
line << "\n\n"
|
111
113
|
else
|
112
114
|
if $DEBUG_MODE
|
113
|
-
format_wiki!(e.last)
|
115
|
+
# format_wiki!(e.last)
|
114
116
|
line = e.last
|
115
117
|
line << "+OTHER+"
|
116
118
|
else
|
@@ -119,8 +121,9 @@ wpconv.extract_text do |article|
|
|
119
121
|
end
|
120
122
|
contents << line
|
121
123
|
end
|
122
|
-
format_article!(contents)
|
123
124
|
convert_characters!(contents)
|
125
|
+
remove_table!(contents) unless $leave_table
|
126
|
+
remove_ref!(contents) unless $leave_ref
|
124
127
|
|
125
128
|
##### cleanup #####
|
126
129
|
if /\A\s*\z/m =~ contents
|
@@ -129,7 +132,6 @@ wpconv.extract_text do |article|
|
|
129
132
|
result = config[:title] ? title + "\n" << contents : contents
|
130
133
|
end
|
131
134
|
result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
132
|
-
result.gsub!(/^[\s\W]+$/)
|
133
135
|
result.gsub!(/\n\n\n+/m){"\n\n"}
|
134
136
|
result << "\n"
|
135
137
|
end
|
data/lib/wp2txt.rb
CHANGED
@@ -3,12 +3,7 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
-
# require "rubygems"
|
7
|
-
# require "bundler/setup"
|
8
|
-
|
9
6
|
require "nokogiri"
|
10
|
-
# require "oga"
|
11
|
-
# require "ox"
|
12
7
|
|
13
8
|
require 'pp'
|
14
9
|
require "wp2txt/article"
|
@@ -249,36 +244,6 @@ module Wp2txt
|
|
249
244
|
next if /\:/ =~ title
|
250
245
|
text = page.content
|
251
246
|
|
252
|
-
# input = Oga.parse_xml(xml)
|
253
|
-
# page = input.xpath("//xmlns:text").first
|
254
|
-
# title = page.parent.parent.xpath("//xmlns:title").first.text
|
255
|
-
# next if /\:/ =~ title
|
256
|
-
# text = page.text
|
257
|
-
|
258
|
-
# input = Ox.load(xml, :encoding => "UTF-8")
|
259
|
-
# title = ""
|
260
|
-
# text = ""
|
261
|
-
# input.nodes.first.nodes.each do |n|
|
262
|
-
# if n.name == "title"
|
263
|
-
# title = n.nodes.first
|
264
|
-
# if /\:/ =~ title
|
265
|
-
# title = ""
|
266
|
-
# break
|
267
|
-
# end
|
268
|
-
# elsif n.name == "revision"
|
269
|
-
# n.nodes.each do |o|
|
270
|
-
# if o.name == "text"
|
271
|
-
# text = o.nodes.first
|
272
|
-
# break
|
273
|
-
# end
|
274
|
-
# end
|
275
|
-
# end
|
276
|
-
# end
|
277
|
-
# next if title == "" || text == ""
|
278
|
-
|
279
|
-
# remove all comment texts
|
280
|
-
# and insert as many number of new line chars included in
|
281
|
-
# each comment instead
|
282
247
|
text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
|
283
248
|
num_of_newlines = content.count("\n")
|
284
249
|
if num_of_newlines == 0
|
@@ -299,6 +264,7 @@ module Wp2txt
|
|
299
264
|
|
300
265
|
#close the present file, then open a new one
|
301
266
|
if end_flag
|
267
|
+
output_text.strip!
|
302
268
|
@fp.puts(output_text)
|
303
269
|
output_text = ""
|
304
270
|
@total_size = 0
|
@@ -311,7 +277,10 @@ module Wp2txt
|
|
311
277
|
next
|
312
278
|
end
|
313
279
|
end
|
314
|
-
|
280
|
+
if output_text != ""
|
281
|
+
output_text.strip!
|
282
|
+
@fp.puts(output_text)
|
283
|
+
end
|
315
284
|
notify_parent(true)
|
316
285
|
@parent.after
|
317
286
|
@fp.close
|
data/lib/wp2txt/article.rb
CHANGED
@@ -103,7 +103,7 @@ module Wp2txt
|
|
103
103
|
when $in_inputbox_regex
|
104
104
|
@elements << create_element(:mw_inputbox, line)
|
105
105
|
when $in_inputbox_regex1
|
106
|
-
mode = :mw_inputbox
|
106
|
+
mode = :mw_inputbox
|
107
107
|
@elements << create_element(:mw_inputbox, line)
|
108
108
|
when $in_source_regex
|
109
109
|
@elements << create_element(:mw_source, line)
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -129,7 +129,6 @@ module Wp2txt
|
|
129
129
|
convert_characters!(text)
|
130
130
|
correct_inline_template!(text) unless $leave_template
|
131
131
|
remove_templates!(text) unless $leave_template
|
132
|
-
remove_table!(text) unless $leave_table
|
133
132
|
end
|
134
133
|
|
135
134
|
#################### parser for nested structure ####################
|
@@ -295,16 +294,19 @@ module Wp2txt
|
|
295
294
|
str.gsub!($mndash_regex, "–")
|
296
295
|
end
|
297
296
|
|
298
|
-
def remove_hr!(
|
299
|
-
|
297
|
+
def remove_hr!(str)
|
298
|
+
str.gsub!($remove_hr_regex, "")
|
300
299
|
end
|
301
300
|
|
301
|
+
def remove_ref!(str)
|
302
|
+
str.gsub!($format_ref_regex){""}
|
303
|
+
end
|
304
|
+
|
302
305
|
def make_reference!(str)
|
303
306
|
str.gsub!($make_reference_regex_a){"\n"}
|
304
307
|
str.gsub!($make_reference_regex_b){""}
|
305
308
|
str.gsub!($make_reference_regex_c){"[ref]"}
|
306
309
|
str.gsub!($make_reference_regex_d){"[/ref]"}
|
307
|
-
str.gsub!($format_ref_regex){""} unless $leave_ref
|
308
310
|
end
|
309
311
|
|
310
312
|
def format_ref!(page)
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.8
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
@@ -70,7 +70,6 @@ files:
|
|
70
70
|
- bin/benchmark.rb
|
71
71
|
- bin/wp2txt
|
72
72
|
- data/testdata.bz2
|
73
|
-
- error_log.txt
|
74
73
|
- lib/wp2txt.rb
|
75
74
|
- lib/wp2txt/article.rb
|
76
75
|
- lib/wp2txt/mw_api.rb
|
data/error_log.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
[[アンパサンド]]
|