wp2txt 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +65 -0
- data/Rakefile +9 -0
- data/bin/wp2txt +112 -0
- data/data/testdata.bz2 +0 -0
- data/lib/wp2txt.rb +323 -0
- data/lib/wp2txt/article.rb +177 -0
- data/lib/wp2txt/mw_api.rb +65 -0
- data/lib/wp2txt/progressbar.rb +305 -0
- data/lib/wp2txt/utils.rb +430 -0
- data/lib/wp2txt/version.rb +3 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/utils_spec.rb +195 -0
- data/wp2txt.gemspec +26 -0
- metadata +145 -0
@@ -0,0 +1,177 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
$: << File.join(File.dirname(__FILE__))
|
5
|
+
|
6
|
+
require 'strscan'
|
7
|
+
require 'utils'
|
8
|
+
|
9
|
+
module Wp2txt

  # Element types an article can contain; callers may later choose which
  # of them to print:
  #   :mw_heading
  #   :mw_htable
  #   :mw_table
  #   :mw_quote
  #   :mw_unordered
  #   :mw_ordered
  #   :mw_definition
  #   :mw_pre
  #   :mw_paragraph
  #   :mw_comment
  #   :mw_math
  #   :mw_source
  #   :mw_inputbox
  #   :mw_template
  #   :mw_link
  #   :mw_summary
  #   :mw_blank
  #   :mw_redirect
  #
  # An article is a list of elements, each of which is a two-item array
  # [TYPE, string]. The parser below works line by line: every input line
  # either starts a new element or is appended to the multi-line element
  # (table, inputbox, source, math, HTML table) that is currently open.
  class Article

    include Wp2txt
    attr_accessor :elements, :title

    # Class variables hold the regexps so they are built only once.
    # Names with a trailing 1 match the opening tag/markup,
    # names with a trailing 2 match the closing tag/markup,
    # names without a trailing number match opening and closing together
    # (i.e. the whole construct on a single line).

    @@in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
    @@in_link_regex = Regexp.new('^\s*\[.*\]\s*$')

    @@in_inputbox_regex  = Regexp.new('<inputbox>.*?<\/inputbox>')
    @@in_inputbox_regex1 = Regexp.new('<inputbox>')
    @@in_inputbox_regex2 = Regexp.new('<\/inputbox>')

    @@in_source_regex  = Regexp.new('<source.*?>.*?<\/source>')
    @@in_source_regex1 = Regexp.new('<source.*?>')
    @@in_source_regex2 = Regexp.new('<\/source>')

    @@in_math_regex  = Regexp.new('<math.*?>.*?<\/math>')
    @@in_math_regex1 = Regexp.new('<math.*?>')
    @@in_math_regex2 = Regexp.new('<\/math>')

    @@in_heading_regex = Regexp.new('^=+.*?=+$')

    # Like the source/math patterns, allow arbitrary content between the
    # opening and the closing tag so that a complete one-line table is
    # recognised and does not open multi-line :mw_htable mode.
    @@in_html_table_regex  = Regexp.new('<table.*?>.*?<\/table>')
    @@in_html_table_regex1 = Regexp.new('<table\b')
    @@in_html_table_regex2 = Regexp.new('<\/\s*table>')

    @@in_table_regex1 = Regexp.new('^\s*\{\|')
    @@in_table_regex2 = Regexp.new('^\|\}.*?$')

    @@in_unordered_regex = Regexp.new('^\*')
    @@in_ordered_regex = Regexp.new('^\#')
    @@in_pre_regex = Regexp.new('^ ')
    @@in_definition_regex = Regexp.new('^[\;\:]')

    @@blank_line_regex = Regexp.new('^\s*$')

    @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)

    # For every multi-line mode, the regexp whose match ends that mode.
    @@closing_regex_for = {
      :mw_table    => @@in_table_regex2,
      :mw_inputbox => @@in_inputbox_regex2,
      :mw_source   => @@in_source_regex2,
      :mw_math     => @@in_math_regex2,
      :mw_htable   => @@in_html_table_regex2
    }

    # text          - wiki markup of the article; parsed immediately
    # title         - article title; surrounding whitespace is stripped
    # strip_tmarker - when true, leading list/definition/pre markers are
    #                 removed from the text of those elements
    def initialize(text, title = "", strip_tmarker = false)
      @title = title.strip
      @strip_tmarker = strip_tmarker
      parse text
    end

    # Builds one element: a [type, text] pair.
    def create_element(tp, text)
      [tp, text]
    end

    # Splits +source+ into elements, stores them in @elements and returns
    # that array.
    def parse(source)
      @elements = []
      mode = nil
      source.each_line do |line|

        # Inside a multi-line construct every line (including the closing
        # one) is appended to the element that opened it.
        if mode
          mode = nil if @@closing_regex_for[mode] =~ line
          @elements.last.last << line
          next
        end

        # Order matters: the single-line form of each construct is tested
        # before its opening-only form, which switches to multi-line mode.
        case line
        when @@blank_line_regex
          @elements << create_element(:mw_blank, "\n")
        when @@redirect_regex
          @elements << create_element(:mw_redirect, line)
        when @@in_template_regex
          @elements << create_element(:mw_template, line)
        when @@in_heading_regex
          @elements << create_element(:mw_heading, "\n" + line + "\n")
        when @@in_inputbox_regex
          @elements << create_element(:mw_inputbox, line)
        when @@in_inputbox_regex1
          mode = :mw_inputbox
          @elements << create_element(:mw_inputbox, line)
        when @@in_source_regex
          @elements << create_element(:mw_source, line)
        when @@in_source_regex1
          mode = :mw_source
          @elements << create_element(:mw_source, line)
        when @@in_math_regex
          @elements << create_element(:mw_math, line)
        when @@in_math_regex1
          mode = :mw_math
          @elements << create_element(:mw_math, line)
        when @@in_html_table_regex
          @elements << create_element(:mw_htable, line)
        when @@in_html_table_regex1
          mode = :mw_htable
          @elements << create_element(:mw_htable, line)
        when @@in_table_regex1
          mode = :mw_table
          @elements << create_element(:mw_table, line)
        when @@in_unordered_regex
          line = line.sub(/\A[\*\#\;\:\ ]+/, "") if @strip_tmarker
          @elements << create_element(:mw_unordered, line)
        when @@in_ordered_regex
          line = line.sub(/\A[\*\#\;\:\ ]+/, "") if @strip_tmarker
          @elements << create_element(:mw_ordered, line)
        when @@in_pre_regex
          # The pre marker is the single leading space matched above.
          line = line.sub(/\A /, "") if @strip_tmarker
          @elements << create_element(:mw_pre, line)
        when @@in_definition_regex
          line = line.sub(/\A[\;\:\ ]+/, "") if @strip_tmarker
          @elements << create_element(:mw_definition, line)
        when @@in_link_regex
          @elements << create_element(:mw_link, line)
        else
          @elements << create_element(:mw_paragraph, line)
        end
      end
      @elements
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
$: << File.join(File.dirname(__FILE__))
|
5
|
+
|
6
|
+
require 'uri'
|
7
|
+
require 'net/http'
|
8
|
+
require 'json'
|
9
|
+
require 'utils'
|
10
|
+
|
11
|
+
|
12
|
+
module Wp2txt
|
13
|
+
|
14
|
+
# Sends an HTTP POST to +uri_string+ and returns the response body.
#
# uri_string - URL of the endpoint; an empty path is treated as "/"
# data       - hash of form fields; each pair is joined as "key=value"
#              without further escaping, so values must already be
#              URL-escaped by the caller
#
# The connection honours the port and scheme given in the URL (TLS is
# enabled for "https"), instead of always talking plain HTTP to port 80.
def post_request(uri_string, data={})
  body = data.map { |key, value| "#{key}=#{value}" }.join("&")
  uri = URI.parse(uri_string)
  uri.path = "/" if uri.path.empty?
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.post(uri.path, body).body
end
|
21
|
+
|
22
|
+
# Expands +template+ through the MediaWiki API "expandtemplates" action.
#
# uri      - URL of the wiki's API endpoint, handed to post_request
# template - wikitext to expand
# page     - page title sent as the "title" parameter
#
# Returns the "*" field of the "expandtemplates" part of the JSON reply
# after it has gone through special_chr and chrref_to_utf. If extracting
# or converting the result raises, the error is printed and the process
# exits.
def expand_template(uri, template, page)
  # URI.escape no longer exists in Ruby 3.0+ and never escaped "&", "="
  # or "+", which corrupted the form body built by post_request.
  text = URI.encode_www_form_component(template)
  title = URI.encode_www_form_component(page)
  data = {"action" => "expandtemplates",
          "format" => "json",
          "text" => text,
          "title" => title}
  jsn = post_request(uri, data)
  hash = JSON.parse(jsn)
  begin
    result = hash["expandtemplates"]["*"]
    result = special_chr(result)
    # NOTE(review): as written these two gsub calls replace each string
    # with itself; presumably the replacements were meant to be escaped
    # forms of the braces - confirm against the intended behaviour.
    return chrref_to_utf(result).gsub("{{", "{{").gsub("}}", "}}")
  rescue => e
    puts "ERROR!"
    p e
    exit
  end
end
|
42
|
+
|
43
|
+
# Renders +wikitext+ through the MediaWiki API "parse" action.
#
# uri      - URL of the wiki's API endpoint, handed to post_request
# wikitext - markup to be parsed by the server
# page     - page title sent as the "title" parameter
#
# Returns the "*" field of parse/text in the JSON reply after it has gone
# through special_chr and chrref_to_utf. If extracting or converting the
# result raises, the error is printed and the process exits.
def parse_wikitext(uri, wikitext, page)
  # URI.escape no longer exists in Ruby 3.0+ and never escaped "&", "="
  # or "+", which corrupted the form body built by post_request.
  text = URI.encode_www_form_component(wikitext)
  title = URI.encode_www_form_component(page)
  data = {"action" => "parse",
          "format" => "json",
          "text" => text,
          "title" => title}
  jsn = post_request(uri, data)
  hash = JSON.parse(jsn)
  begin
    result = hash["parse"]["text"]["*"]
    result = special_chr(result)
    # NOTE(review): as written these two gsub calls replace each string
    # with itself; presumably the replacements were meant to be escaped
    # forms of the brackets - confirm against the intended behaviour.
    return chrref_to_utf(result).gsub("[[", "[[").gsub("]]", "]]")
  rescue => e
    puts "ERROR!"
    p e
    exit
  end
end
|
63
|
+
|
64
|
+
end
|
65
|
+
|
@@ -0,0 +1,305 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#
|
4
|
+
# Ruby/ProgressBar - a text progress bar library
|
5
|
+
#
|
6
|
+
# Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
|
7
|
+
# All rights reserved.
|
8
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
9
|
+
#
|
10
|
+
# You can redistribute it and/or modify it under the terms
|
11
|
+
# of Ruby's license.
|
12
|
+
#
|
13
|
+
|
14
|
+
module Wp2txt
|
15
|
+
# Text progress bar written to an output stream (STDERR by default).
#
# Each redraw renders @format with the values of the fmt_* methods named
# in @format_arguments, and resizes the bar so the line is exactly one
# column narrower than the detected terminal width.
class ProgressBar

  attr_reader :title
  attr_reader :current
  attr_reader :total
  attr_accessor :start_time

  # Replace the sprintf format string / the list of fmt_* suffixes whose
  # results are fed to it.
  attr_writer :format, :format_arguments

  # title - label shown at the left (cut to @title_width - 1 characters)
  # total - value of the counter that corresponds to 100%
  # out   - stream to draw on; it must respond to print and flush
  #
  # The bar is drawn once immediately.
  def initialize(title, total, out = STDERR)
    @title = title
    @total = total
    @out = out
    @terminal_width = 80
    @bar_mark = "o"
    @current = 0
    @previous = 0
    @finished_p = false
    @start_time = Time.now
    @previous_time = @start_time
    @title_width = 14
    @format = "%-#{@title_width}s %3d%% %s %s"
    @format_arguments = [:title, :percentage, :bar, :stat]
    clear
    show
  end

  # Blanks the current terminal line and returns the cursor to column 0.
  def clear
    @out.print "\r"
    @out.print(" " * (get_width - 1))
    @out.print "\r"
  end

  # Jumps to 100%, marks the bar finished and draws it a last time.
  def finish
    @current = @total
    @finished_p = true
    show
  end

  def finished?
    @finished_p
  end

  # Switches the statistics column to bytes / transfer rate / time.
  def file_transfer_mode
    @format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
  end

  # Marks the bar finished at its current position and draws it.
  def halt
    @finished_p = true
    show
  end

  # Advances the counter by +step+ (capped at total), redrawing if needed.
  def inc(step = 1)
    @current += step
    @current = @total if @current > @total
    show_if_needed
    @previous = @current
  end

  # Sets the counter to +count+, redrawing if needed.
  # Raises RuntimeError when +count+ is outside 0..total.
  def set(count)
    if count < 0 || count > @total
      raise "invalid count: #{count} (total: #{@total})"
    end
    @current = count
    show_if_needed
    @previous = @current
  end

  def inspect
    "#<ProgressBar:#{@current}/#{@total}>"
  end

  private

  # The bar itself: filled part, padding, enclosed in "|".
  def fmt_bar
    bar_width = do_percentage * @terminal_width / 100
    sprintf("|%s%s|",
            @bar_mark * bar_width,
            " " * (@terminal_width - bar_width))
  end

  def fmt_percentage
    do_percentage
  end

  # Elapsed time once finished, estimated remaining time before that.
  def fmt_stat
    @finished_p ? elapsed : eta
  end

  def fmt_stat_for_file_transfer
    timing = @finished_p ? elapsed : eta
    sprintf("%s %s %s", bytes, transfer_rate, timing)
  end

  def fmt_title
    @title[0, (@title_width - 1)] + ":"
  end

  # Formats a byte count with a B/KB/MB/GB unit.
  def convert_bytes(bytes)
    if bytes < 1024
      sprintf("%6dB", bytes)
    elsif bytes < 1024 * 1000 # 1000kb
      sprintf("%5.1fKB", bytes.to_f / 1024)
    elsif bytes < 1024 * 1024 * 1000 # 1000mb
      sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
    else
      sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
    end
  end

  def transfer_rate
    bytes_per_second = @current.to_f / (Time.now - @start_time)
    sprintf("%s/s", convert_bytes(bytes_per_second))
  end

  def bytes
    convert_bytes(@current)
  end

  # Formats a number of seconds as HH:MM:SS.
  def format_time(t)
    t = t.to_i
    sec = t % 60
    min = (t / 60) % 60
    hour = t / 3600
    sprintf("%02d:%02d:%02d", hour, min, sec)
  end

  # ETA stands for Estimated Time of Arrival.
  def eta
    return "ETA: --:--:--" if @current == 0

    passed = Time.now - @start_time
    remaining = passed * @total / @current - passed
    sprintf("ETA: %s", format_time(remaining))
  end

  def elapsed
    sprintf("Time: %s", format_time(Time.now - @start_time))
  end

  # Line terminator: newline once finished, otherwise a carriage return
  # so the next redraw overwrites the same line.
  def eol
    @finished_p ? "\n" : "\r"
  end

  def do_percentage
    if @total.zero?
      100
    else
      @current * 100 / @total
    end
  end

  # Width of the terminal behind @out, or 80 when it cannot be determined.
  def get_width
    # FIXME: I don't know how portable it is.
    default_width = 80
    tiocgwinsz = 0x5413
    data = [0, 0, 0, 0].pack("SSSS")
    return default_width unless @out.ioctl(tiocgwinsz, data) >= 0

    _rows, cols = data.unpack("SSSS")
    # A reported width of 0 is unusable: clear would then ask for a
    # negative number of blanks. Fall back to the default instead.
    cols > 0 ? cols : default_width
  rescue StandardError, NotImplementedError
    # NotImplementedError (raised where ioctl is unavailable) is not a
    # StandardError, hence listed explicitly. Signals and exit requests
    # are deliberately no longer swallowed.
    default_width
  end

  # Draws the bar, first adjusting @terminal_width (by re-entering this
  # method) until the rendered line is exactly width - 1 characters long.
  def show
    arguments = @format_arguments.map { |name| send("fmt_#{name}") }
    line = sprintf(@format, *arguments)

    width = get_width
    if line.length == width - 1
      @out.print(line + eol)
      @out.flush
    elsif line.length >= width
      @terminal_width = [@terminal_width - (line.length - width + 1), 0].max
      if @terminal_width == 0 then @out.print(line + eol) else show end
    else # line.length < width - 1
      @terminal_width += width - line.length + 1
      show
    end
    @previous_time = Time.now
  end

  # Redraws only when the integer percentage changed, at least a second
  # passed since the last redraw, or the bar is finished.
  def show_if_needed
    if @total.zero?
      cur_percentage = 100
      prev_percentage = 0
    else
      cur_percentage = (@current * 100 / @total).to_i
      prev_percentage = (@previous * 100 / @total).to_i
    end

    # Use "!=" instead of ">" to support negative changes
    if cur_percentage != prev_percentage ||
        Time.now - @previous_time >= 1 || @finished_p
      show
    end
  end
end
|
232
|
+
|
233
|
+
# Progress bar that counts down: it reports the complement of the
# percentage computed by ProgressBar.
class ReversedProgressBar < ProgressBar
  def do_percentage
    forward = super
    100 - forward
  end
end
|
238
|
+
|
239
|
+
##########################################
|
240
|
+
|
241
|
+
# Progress bar titled with the base name of a file, carrying two extra
# counters (nhits, nsentences) that callers may read and write.
class NewProgressBar < ProgressBar

  attr_accessor :nhits, :nsentences

  # Keep the inherited constructor reachable under its original alias.
  alias_method :org_initialize, :initialize

  # filename - path whose base name becomes the bar's title
  # size     - total passed on to the inherited constructor
  def initialize(filename, size)
    @nhits = 0
    @nsentences = 0
    label = File.basename(filename)
    org_initialize(label, size)
  end

  # Keep the inherited statistics formatter reachable under its alias.
  alias_method :org_fmt_stat, :fmt_stat

  # Currently identical to the inherited statistics column.
  def fmt_stat
    org_fmt_stat # + " Hits: " + @nhits.to_s
  end
end
|
259
|
+
|
260
|
+
# Command-line front end that owns a NewProgressBar and translates
# absolute progress values into increments for it.
class CmdProgbar

  attr_accessor :pbar, :last_value

  def initialize
    @last_value = 0
    @pbar = nil
  end

  # Writes +str+ to standard output: without a newline when +i+ is 0,
  # with one otherwise.
  def msg(str, i = nil)
    if 0 == i
      print str
    else
      puts str
    end
  end

  # Moves the bar forward by the (truncated) difference between +value+
  # and the previously reported value; +elt+ and +eta+ are only stored.
  def prg_update(value, elt, eta)
    @elt = elt
    @eta = eta
    delta = (value - @last_value).to_i
    @pbar.inc(delta)
    @last_value = value
  end

  # Creates the progress bar for +filename+ with +linesize+ as its total.
  def data_set(filename, linesize)
    @pbar = NewProgressBar.new(filename, linesize)
  end

  # Copies the two counters onto the progress bar.
  def data_update(nhits, nsentences)
    bar = @pbar
    bar.nhits = nhits
    bar.nsentences = nsentences
  end

  # Resets this object to its freshly constructed state.
  def before
    initialize
  end

  # Finishes the progress bar; always returns true.
  def after
    @pbar.finish
    true
  end
end
|
305
|
+
end
|