wp2txt 0.9.5.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +99 -58
- data/bin/wp2txt +143 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +48 -24
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +118 -148
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +4 -0
- metadata +50 -9
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/bin/benchmark.rb
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
-
|
7
|
-
require 'wp2txt'
|
8
|
-
require 'wp2txt/utils'
|
9
|
-
include Wp2txt
|
10
|
-
require 'benchmark'
|
11
|
-
|
12
|
-
data_dir = File.join(File.dirname(__FILE__), '..', "data")
|
13
|
-
|
14
|
-
parent = Wp2txt::CmdProgbar.new
|
15
|
-
input_file = File.join(data_dir, "testdata_ja.bz2")
|
16
|
-
output_dir = data_dir
|
17
|
-
tfile_size = 10
|
18
|
-
num_threads = 1
|
19
|
-
convert = true
|
20
|
-
strip_tmarker = true
|
21
|
-
|
22
|
-
Benchmark.bm do |x|
|
23
|
-
x.report do
|
24
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
|
25
|
-
wpconv.extract_text do |article|
|
26
|
-
format_wiki!(article.title)
|
27
|
-
title = "[[#{article.title}]]\n"
|
28
|
-
convert_characters!(title)
|
29
|
-
|
30
|
-
contents = "\nCATEGORIES: "
|
31
|
-
contents += article.categories.join(", ")
|
32
|
-
contents += "\n\n"
|
33
|
-
|
34
|
-
article.elements.each do |e|
|
35
|
-
case e.first
|
36
|
-
when :mw_heading
|
37
|
-
format_wiki!(e.last)
|
38
|
-
line = e.last
|
39
|
-
when :mw_paragraph
|
40
|
-
format_wiki!(e.last)
|
41
|
-
line = e.last
|
42
|
-
when :mw_table, :mw_htable
|
43
|
-
format_wiki!(e.last)
|
44
|
-
line = e.last
|
45
|
-
when :mw_pre
|
46
|
-
line = e.last
|
47
|
-
when :mw_quote
|
48
|
-
format_wiki!(e.last)
|
49
|
-
line = e.last
|
50
|
-
when :mw_unordered, :mw_ordered, :mw_definition
|
51
|
-
format_wiki!(e.last)
|
52
|
-
line = e.last
|
53
|
-
when :mw_redirect
|
54
|
-
format_wiki!(e.last)
|
55
|
-
line = e.last
|
56
|
-
line += "\n\n"
|
57
|
-
else
|
58
|
-
next
|
59
|
-
end
|
60
|
-
contents << line
|
61
|
-
end
|
62
|
-
format_wiki!(contents)
|
63
|
-
convert_characters!(contents)
|
64
|
-
|
65
|
-
##### cleanup #####
|
66
|
-
if /\A[\s ]*\z/m =~ contents
|
67
|
-
result = ""
|
68
|
-
else
|
69
|
-
result = title + "\n" + contents
|
70
|
-
end
|
71
|
-
result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
|
72
|
-
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
data/lib/wp2txt/mw_api.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
|
6
|
-
require 'uri'
|
7
|
-
require 'net/http'
|
8
|
-
require 'json'
|
9
|
-
require 'utils'
|
10
|
-
|
11
|
-
|
12
|
-
module Wp2txt
|
13
|
-
|
14
|
-
def post_request(uri_string, data={})
|
15
|
-
data = data.map{ |k, v| "#{k}=#{v}" }.join("&")
|
16
|
-
uri = URI.parse(uri_string)
|
17
|
-
uri.path = "/" if uri.path.empty?
|
18
|
-
http = Net::HTTP.new(uri.host)
|
19
|
-
return http.post(uri.path, data).body
|
20
|
-
end
|
21
|
-
|
22
|
-
def expand_template(uri, template, page)
|
23
|
-
text = URI.escape(template)
|
24
|
-
title = URI.escape(page)
|
25
|
-
data = {"action" => "expandtemplates",
|
26
|
-
"format" => "json",
|
27
|
-
"text" => text,
|
28
|
-
"title" => title}
|
29
|
-
jsn = post_request(uri, data)
|
30
|
-
hash = JSON.parse(jsn)
|
31
|
-
begin
|
32
|
-
result = hash["expandtemplates"]["*"]
|
33
|
-
result = special_chr(result)
|
34
|
-
return chrref_to_utf(result).gsub("{{", "{{").gsub("}}", "}}")
|
35
|
-
rescue => e
|
36
|
-
puts "ERROR!"
|
37
|
-
p e
|
38
|
-
exit
|
39
|
-
template
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def parse_wikitext(uri, wikitext, page)
|
44
|
-
text = URI.escape(wikitext)
|
45
|
-
title = URI.escape(page)
|
46
|
-
data = {"action" => "parse",
|
47
|
-
"format" => "json",
|
48
|
-
"text" => text,
|
49
|
-
"title" => title}
|
50
|
-
jsn = post_request(uri, data)
|
51
|
-
hash = JSON.parse(jsn)
|
52
|
-
begin
|
53
|
-
result = hash["parse"]["text"]["*"]
|
54
|
-
result = special_chr(result)
|
55
|
-
return chrref_to_utf(result).gsub("[[", "[[").gsub("]]", "]]")
|
56
|
-
rescue => e
|
57
|
-
puts "ERROR!"
|
58
|
-
p e
|
59
|
-
exit
|
60
|
-
template
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
end
|
65
|
-
|
data/lib/wp2txt/progressbar.rb
DELETED
@@ -1,305 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
#
|
4
|
-
# Ruby/ProgressBar - a text progress bar library
|
5
|
-
#
|
6
|
-
# Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
|
7
|
-
# All rights reserved.
|
8
|
-
# This is free software with ABSOLUTELY NO WARRANTY.
|
9
|
-
#
|
10
|
-
# You can redistribute it and/or modify it under the terms
|
11
|
-
# of Ruby's license.
|
12
|
-
#
|
13
|
-
|
14
|
-
module Wp2txt
|
15
|
-
class ProgressBar
|
16
|
-
|
17
|
-
def initialize (title, total, out = STDERR)
|
18
|
-
@title = title
|
19
|
-
@total = total
|
20
|
-
@out = out
|
21
|
-
@terminal_width = 80
|
22
|
-
@bar_mark = "o"
|
23
|
-
@current = 0
|
24
|
-
@previous = 0
|
25
|
-
@finished_p = false
|
26
|
-
@start_time = Time.now
|
27
|
-
@previous_time = @start_time
|
28
|
-
@title_width = 14
|
29
|
-
@format = "%-#{@title_width}s %3d%% %s %s"
|
30
|
-
@format_arguments = [:title, :percentage, :bar, :stat]
|
31
|
-
clear
|
32
|
-
show
|
33
|
-
end
|
34
|
-
attr_reader :title
|
35
|
-
attr_reader :current
|
36
|
-
attr_reader :total
|
37
|
-
attr_accessor :start_time
|
38
|
-
|
39
|
-
private
|
40
|
-
def fmt_bar
|
41
|
-
bar_width = do_percentage * @terminal_width / 100
|
42
|
-
sprintf("|%s%s|",
|
43
|
-
@bar_mark * bar_width,
|
44
|
-
" " * (@terminal_width - bar_width))
|
45
|
-
end
|
46
|
-
|
47
|
-
def fmt_percentage
|
48
|
-
do_percentage
|
49
|
-
end
|
50
|
-
|
51
|
-
def fmt_stat
|
52
|
-
if @finished_p then elapsed else eta end
|
53
|
-
end
|
54
|
-
|
55
|
-
def fmt_stat_for_file_transfer
|
56
|
-
if @finished_p then
|
57
|
-
sprintf("%s %s %s", bytes, transfer_rate, elapsed)
|
58
|
-
else
|
59
|
-
sprintf("%s %s %s", bytes, transfer_rate, eta)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
def fmt_title
|
64
|
-
@title[0,(@title_width - 1)] + ":"
|
65
|
-
end
|
66
|
-
|
67
|
-
def convert_bytes (bytes)
|
68
|
-
if bytes < 1024
|
69
|
-
sprintf("%6dB", bytes)
|
70
|
-
elsif bytes < 1024 * 1000 # 1000kb
|
71
|
-
sprintf("%5.1fKB", bytes.to_f / 1024)
|
72
|
-
elsif bytes < 1024 * 1024 * 1000 # 1000mb
|
73
|
-
sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
|
74
|
-
else
|
75
|
-
sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def transfer_rate
|
80
|
-
bytes_per_second = @current.to_f / (Time.now - @start_time)
|
81
|
-
sprintf("%s/s", convert_bytes(bytes_per_second))
|
82
|
-
end
|
83
|
-
|
84
|
-
def bytes
|
85
|
-
convert_bytes(@current)
|
86
|
-
end
|
87
|
-
|
88
|
-
def format_time (t)
|
89
|
-
t = t.to_i
|
90
|
-
sec = t % 60
|
91
|
-
min = (t / 60) % 60
|
92
|
-
hour = t / 3600
|
93
|
-
sprintf("%02d:%02d:%02d", hour, min, sec);
|
94
|
-
end
|
95
|
-
|
96
|
-
# ETA stands for Estimated Time of Arrival.
|
97
|
-
def eta
|
98
|
-
if @current == 0
|
99
|
-
"ETA: --:--:--"
|
100
|
-
else
|
101
|
-
elapsed = Time.now - @start_time
|
102
|
-
eta = elapsed * @total / @current - elapsed;
|
103
|
-
sprintf("ETA: %s", format_time(eta))
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def elapsed
|
108
|
-
elapsed = Time.now - @start_time
|
109
|
-
sprintf("Time: %s", format_time(elapsed))
|
110
|
-
end
|
111
|
-
|
112
|
-
def eol
|
113
|
-
if @finished_p then "\n" else "\r" end
|
114
|
-
end
|
115
|
-
|
116
|
-
def do_percentage
|
117
|
-
if @total.zero?
|
118
|
-
100
|
119
|
-
else
|
120
|
-
@current * 100 / @total
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
def get_width
|
125
|
-
# FIXME: I don't know how portable it is.
|
126
|
-
default_width = 80
|
127
|
-
begin
|
128
|
-
tiocgwinsz = 0x5413
|
129
|
-
data = [0, 0, 0, 0].pack("SSSS")
|
130
|
-
if @out.ioctl(tiocgwinsz, data) >= 0 then
|
131
|
-
rows, cols, xpixels, ypixels = data.unpack("SSSS")
|
132
|
-
if cols >= 0 then cols else default_width end
|
133
|
-
else
|
134
|
-
default_width
|
135
|
-
end
|
136
|
-
rescue Exception
|
137
|
-
default_width
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def show
|
142
|
-
arguments = @format_arguments.map {|method|
|
143
|
-
method = sprintf("fmt_%s", method)
|
144
|
-
send(method)
|
145
|
-
}
|
146
|
-
line = sprintf(@format, *arguments)
|
147
|
-
|
148
|
-
width = get_width
|
149
|
-
if line.length == width - 1
|
150
|
-
@out.print(line + eol)
|
151
|
-
@out.flush
|
152
|
-
elsif line.length >= width
|
153
|
-
@terminal_width = [@terminal_width - (line.length - width + 1), 0].max
|
154
|
-
if @terminal_width == 0 then @out.print(line + eol) else show end
|
155
|
-
else # line.length < width - 1
|
156
|
-
@terminal_width += width - line.length + 1
|
157
|
-
show
|
158
|
-
end
|
159
|
-
@previous_time = Time.now
|
160
|
-
end
|
161
|
-
|
162
|
-
def show_if_needed
|
163
|
-
if @total.zero?
|
164
|
-
cur_percentage = 100
|
165
|
-
prev_percentage = 0
|
166
|
-
else
|
167
|
-
cur_percentage = (@current * 100 / @total).to_i
|
168
|
-
prev_percentage = (@previous * 100 / @total).to_i
|
169
|
-
end
|
170
|
-
|
171
|
-
# Use "!=" instead of ">" to support negative changes
|
172
|
-
if cur_percentage != prev_percentage ||
|
173
|
-
Time.now - @previous_time >= 1 || @finished_p
|
174
|
-
show
|
175
|
-
end
|
176
|
-
end
|
177
|
-
|
178
|
-
public
|
179
|
-
def clear
|
180
|
-
@out.print "\r"
|
181
|
-
@out.print(" " * (get_width - 1))
|
182
|
-
@out.print "\r"
|
183
|
-
end
|
184
|
-
|
185
|
-
def finish
|
186
|
-
@current = @total
|
187
|
-
@finished_p = true
|
188
|
-
show
|
189
|
-
end
|
190
|
-
|
191
|
-
def finished?
|
192
|
-
@finished_p
|
193
|
-
end
|
194
|
-
|
195
|
-
def file_transfer_mode
|
196
|
-
@format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
|
197
|
-
end
|
198
|
-
|
199
|
-
def format= (format)
|
200
|
-
@format = format
|
201
|
-
end
|
202
|
-
|
203
|
-
def format_arguments= (arguments)
|
204
|
-
@format_arguments = arguments
|
205
|
-
end
|
206
|
-
|
207
|
-
def halt
|
208
|
-
@finished_p = true
|
209
|
-
show
|
210
|
-
end
|
211
|
-
|
212
|
-
def inc (step = 1)
|
213
|
-
@current += step
|
214
|
-
@current = @total if @current > @total
|
215
|
-
show_if_needed
|
216
|
-
@previous = @current
|
217
|
-
end
|
218
|
-
|
219
|
-
def set (count)
|
220
|
-
if count < 0 || count > @total
|
221
|
-
raise "invalid count: #{count} (total: #{@total})"
|
222
|
-
end
|
223
|
-
@current = count
|
224
|
-
show_if_needed
|
225
|
-
@previous = @current
|
226
|
-
end
|
227
|
-
|
228
|
-
def inspect
|
229
|
-
"#<ProgressBar:#{@current}/#{@total}>"
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
class ReversedProgressBar < ProgressBar
|
234
|
-
def do_percentage
|
235
|
-
100 - super
|
236
|
-
end
|
237
|
-
end
|
238
|
-
|
239
|
-
##########################################
|
240
|
-
|
241
|
-
class NewProgressBar < ProgressBar
|
242
|
-
|
243
|
-
attr_accessor :nhits, :nsentences
|
244
|
-
|
245
|
-
alias org_initialize initialize
|
246
|
-
|
247
|
-
def initialize(filename, size)
|
248
|
-
@nhits = 0
|
249
|
-
@nsentences = 0
|
250
|
-
org_initialize(File.basename(filename), size)
|
251
|
-
end
|
252
|
-
|
253
|
-
alias org_fmt_stat fmt_stat
|
254
|
-
|
255
|
-
def fmt_stat
|
256
|
-
org_fmt_stat # + " Hits: " + @nhits.to_s
|
257
|
-
end
|
258
|
-
end
|
259
|
-
|
260
|
-
class CmdProgbar
|
261
|
-
|
262
|
-
attr_accessor :pbar, :last_value
|
263
|
-
|
264
|
-
def initialize
|
265
|
-
@last_value = 0
|
266
|
-
@pbar = nil
|
267
|
-
end
|
268
|
-
|
269
|
-
def msg(str, i = nil)
|
270
|
-
case i
|
271
|
-
when 0
|
272
|
-
print str
|
273
|
-
else
|
274
|
-
puts str
|
275
|
-
end
|
276
|
-
end
|
277
|
-
|
278
|
-
def prg_update(value, elt, eta)
|
279
|
-
@elt = elt
|
280
|
-
@eta = eta
|
281
|
-
offset = value - @last_value
|
282
|
-
@pbar.inc(offset.to_i)
|
283
|
-
@last_value = value
|
284
|
-
end
|
285
|
-
|
286
|
-
def data_set(filename, linesize)
|
287
|
-
@pbar = NewProgressBar.new(filename, linesize)
|
288
|
-
end
|
289
|
-
|
290
|
-
def data_update(nhits, nsentences)
|
291
|
-
@pbar.nhits = nhits
|
292
|
-
@pbar.nsentences = nsentences
|
293
|
-
end
|
294
|
-
|
295
|
-
def before
|
296
|
-
initialize
|
297
|
-
end
|
298
|
-
|
299
|
-
def after
|
300
|
-
@pbar.finish
|
301
|
-
return true
|
302
|
-
end
|
303
|
-
|
304
|
-
end
|
305
|
-
end
|