wp2txt 0.7.8 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/wp2txt +11 -9
- data/lib/wp2txt.rb +5 -36
- data/lib/wp2txt/article.rb +1 -1
- data/lib/wp2txt/utils.rb +6 -4
- data/lib/wp2txt/version.rb +1 -1
- metadata +1 -2
- data/error_log.txt +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0610b7e28e04c4cd9c3a1401c88e15f6ddb16ec
|
4
|
+
data.tar.gz: b866915631fdc956395c005735b089ddff7956e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e9fbef3de5ed866de0b3c7fadd96bdf0ff501b71c2d9f6f282eed538194bdfff8d9659cf53aedf95062c9aadf2ec90393075158ef9c8ae78f3e53ce84119f764
|
7
|
+
data.tar.gz: 36ad316986d94a6be89ccb591dec510dc4695bede188448fde0745702faad8039d0df4a84d2f0730dd11749a63d58e6b63b51b2bce585d8f8ccb3ff02553c3c8
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
-
**Important
|
5
|
+
**Important: This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution!**
|
6
6
|
|
7
7
|
### About ###
|
8
8
|
|
data/bin/wp2txt
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
6
|
|
7
|
-
|
7
|
+
DEBUG_MODE = true
|
8
8
|
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
|
9
9
|
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
10
10
|
|
@@ -76,16 +76,18 @@ wpconv.extract_text do |article|
|
|
76
76
|
when :mw_heading
|
77
77
|
next if !config[:heading]
|
78
78
|
format_wiki!(e.last)
|
79
|
+
format_article!(e.last)
|
79
80
|
line = e.last
|
80
81
|
line << "+HEADING+" if $DEBUG_MODE
|
81
82
|
when :mw_paragraph
|
82
83
|
# next if !config[:paragraph]
|
83
84
|
format_wiki!(e.last)
|
84
|
-
|
85
|
+
format_article!(e.last)
|
86
|
+
line = e.last + "\n"
|
85
87
|
line << "+PARAGRAPH+" if $DEBUG_MODE
|
86
88
|
when :mw_table, :mw_htable
|
87
89
|
next if !config[:table]
|
88
|
-
format_wiki!(e.last)
|
90
|
+
# format_wiki!(e.last)
|
89
91
|
line = e.last
|
90
92
|
line << "+TABLE+" if $DEBUG_MODE
|
91
93
|
when :mw_pre
|
@@ -94,23 +96,23 @@ wpconv.extract_text do |article|
|
|
94
96
|
line << "+PRE+" if $DEBUG_MODE
|
95
97
|
when :mw_quote
|
96
98
|
# next if !config[:quote]
|
97
|
-
format_wiki!(e.last)
|
99
|
+
# format_wiki!(e.last)
|
98
100
|
line = e.last
|
99
101
|
line << "+QUOTE+" if $DEBUG_MODE
|
100
102
|
when :mw_unordered, :mw_ordered, :mw_definition
|
101
103
|
next if !config[:list]
|
102
|
-
format_wiki!(e.last)
|
104
|
+
# format_wiki!(e.last)
|
103
105
|
line = e.last
|
104
106
|
line << "+LIST+" if $DEBUG_MODE
|
105
107
|
when :mw_redirect
|
106
108
|
next if !config[:redirect]
|
107
|
-
format_wiki!(e.last)
|
109
|
+
# format_wiki!(e.last)
|
108
110
|
line = e.last
|
109
111
|
line << "+REDIRECT+" if $DEBUG_MODE
|
110
112
|
line << "\n\n"
|
111
113
|
else
|
112
114
|
if $DEBUG_MODE
|
113
|
-
format_wiki!(e.last)
|
115
|
+
# format_wiki!(e.last)
|
114
116
|
line = e.last
|
115
117
|
line << "+OTHER+"
|
116
118
|
else
|
@@ -119,8 +121,9 @@ wpconv.extract_text do |article|
|
|
119
121
|
end
|
120
122
|
contents << line
|
121
123
|
end
|
122
|
-
format_article!(contents)
|
123
124
|
convert_characters!(contents)
|
125
|
+
remove_table!(contents) unless $leave_table
|
126
|
+
remove_ref!(contents) unless $leave_ref
|
124
127
|
|
125
128
|
##### cleanup #####
|
126
129
|
if /\A\s*\z/m =~ contents
|
@@ -129,7 +132,6 @@ wpconv.extract_text do |article|
|
|
129
132
|
result = config[:title] ? title + "\n" << contents : contents
|
130
133
|
end
|
131
134
|
result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
132
|
-
result.gsub!(/^[\s\W]+$/)
|
133
135
|
result.gsub!(/\n\n\n+/m){"\n\n"}
|
134
136
|
result << "\n"
|
135
137
|
end
|
data/lib/wp2txt.rb
CHANGED
@@ -3,12 +3,7 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
-
# require "rubygems"
|
7
|
-
# require "bundler/setup"
|
8
|
-
|
9
6
|
require "nokogiri"
|
10
|
-
# require "oga"
|
11
|
-
# require "ox"
|
12
7
|
|
13
8
|
require 'pp'
|
14
9
|
require "wp2txt/article"
|
@@ -249,36 +244,6 @@ module Wp2txt
|
|
249
244
|
next if /\:/ =~ title
|
250
245
|
text = page.content
|
251
246
|
|
252
|
-
# input = Oga.parse_xml(xml)
|
253
|
-
# page = input.xpath("//xmlns:text").first
|
254
|
-
# title = page.parent.parent.xpath("//xmlns:title").first.text
|
255
|
-
# next if /\:/ =~ title
|
256
|
-
# text = page.text
|
257
|
-
|
258
|
-
# input = Ox.load(xml, :encoding => "UTF-8")
|
259
|
-
# title = ""
|
260
|
-
# text = ""
|
261
|
-
# input.nodes.first.nodes.each do |n|
|
262
|
-
# if n.name == "title"
|
263
|
-
# title = n.nodes.first
|
264
|
-
# if /\:/ =~ title
|
265
|
-
# title = ""
|
266
|
-
# break
|
267
|
-
# end
|
268
|
-
# elsif n.name == "revision"
|
269
|
-
# n.nodes.each do |o|
|
270
|
-
# if o.name == "text"
|
271
|
-
# text = o.nodes.first
|
272
|
-
# break
|
273
|
-
# end
|
274
|
-
# end
|
275
|
-
# end
|
276
|
-
# end
|
277
|
-
# next if title == "" || text == ""
|
278
|
-
|
279
|
-
# remove all comment texts
|
280
|
-
# and insert as many number of new line chars included in
|
281
|
-
# each comment instead
|
282
247
|
text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
|
283
248
|
num_of_newlines = content.count("\n")
|
284
249
|
if num_of_newlines == 0
|
@@ -299,6 +264,7 @@ module Wp2txt
|
|
299
264
|
|
300
265
|
#close the present file, then open a new one
|
301
266
|
if end_flag
|
267
|
+
output_text.strip!
|
302
268
|
@fp.puts(output_text)
|
303
269
|
output_text = ""
|
304
270
|
@total_size = 0
|
@@ -311,7 +277,10 @@ module Wp2txt
|
|
311
277
|
next
|
312
278
|
end
|
313
279
|
end
|
314
|
-
|
280
|
+
if output_text != ""
|
281
|
+
output_text.strip!
|
282
|
+
@fp.puts(output_text)
|
283
|
+
end
|
315
284
|
notify_parent(true)
|
316
285
|
@parent.after
|
317
286
|
@fp.close
|
data/lib/wp2txt/article.rb
CHANGED
@@ -103,7 +103,7 @@ module Wp2txt
|
|
103
103
|
when $in_inputbox_regex
|
104
104
|
@elements << create_element(:mw_inputbox, line)
|
105
105
|
when $in_inputbox_regex1
|
106
|
-
mode = :mw_inputbox
|
106
|
+
mode = :mw_inputbox
|
107
107
|
@elements << create_element(:mw_inputbox, line)
|
108
108
|
when $in_source_regex
|
109
109
|
@elements << create_element(:mw_source, line)
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -129,7 +129,6 @@ module Wp2txt
|
|
129
129
|
convert_characters!(text)
|
130
130
|
correct_inline_template!(text) unless $leave_template
|
131
131
|
remove_templates!(text) unless $leave_template
|
132
|
-
remove_table!(text) unless $leave_table
|
133
132
|
end
|
134
133
|
|
135
134
|
#################### parser for nested structure ####################
|
@@ -295,16 +294,19 @@ module Wp2txt
|
|
295
294
|
str.gsub!($mndash_regex, "–")
|
296
295
|
end
|
297
296
|
|
298
|
-
def remove_hr!(
|
299
|
-
|
297
|
+
def remove_hr!(str)
|
298
|
+
str.gsub!($remove_hr_regex, "")
|
300
299
|
end
|
301
300
|
|
301
|
+
def remove_ref!(str)
|
302
|
+
str.gsub!($format_ref_regex){""}
|
303
|
+
end
|
304
|
+
|
302
305
|
def make_reference!(str)
|
303
306
|
str.gsub!($make_reference_regex_a){"\n"}
|
304
307
|
str.gsub!($make_reference_regex_b){""}
|
305
308
|
str.gsub!($make_reference_regex_c){"[ref]"}
|
306
309
|
str.gsub!($make_reference_regex_d){"[/ref]"}
|
307
|
-
str.gsub!($format_ref_regex){""} unless $leave_ref
|
308
310
|
end
|
309
311
|
|
310
312
|
def format_ref!(page)
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
@@ -70,7 +70,6 @@ files:
|
|
70
70
|
- bin/benchmark.rb
|
71
71
|
- bin/wp2txt
|
72
72
|
- data/testdata.bz2
|
73
|
-
- error_log.txt
|
74
73
|
- lib/wp2txt.rb
|
75
74
|
- lib/wp2txt/article.rb
|
76
75
|
- lib/wp2txt/mw_api.rb
|
data/error_log.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
[[アンパサンド]]
|