wp2txt 0.9.5.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +99 -58
- data/bin/wp2txt +143 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +48 -24
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +118 -148
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +4 -0
- metadata +50 -9
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/bin/benchmark.rb
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
-
|
7
|
-
require 'wp2txt'
|
8
|
-
require 'wp2txt/utils'
|
9
|
-
include Wp2txt
|
10
|
-
require 'benchmark'
|
11
|
-
|
12
|
-
# Benchmark script body: times one single-threaded conversion of the bundled
# Japanese sample dump, rebuilding each article's plain-text form in the block.
data_dir = File.join(File.dirname(__FILE__), '..', "data")

parent = Wp2txt::CmdProgbar.new
input_file = File.join(data_dir, "testdata_ja.bz2")
output_dir = data_dir
tfile_size = 10
num_threads = 1
convert = true
strip_tmarker = true

# Element kinds whose text goes through format_wiki! before being appended.
# :mw_pre is appended untouched; every other kind is skipped.
formatted_kinds = [
  :mw_heading, :mw_paragraph, :mw_table, :mw_htable, :mw_quote,
  :mw_unordered, :mw_ordered, :mw_definition, :mw_redirect
]

Benchmark.bm do |bm|
  bm.report do
    runner = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
    runner.extract_text do |article|
      format_wiki!(article.title)
      title = "[[#{article.title}]]\n"
      convert_characters!(title)

      contents = "\nCATEGORIES: "
      contents += article.categories.join(", ")
      contents += "\n\n"

      article.elements.each do |element|
        kind = element.first
        verbatim = (kind == :mw_pre)
        next unless verbatim || formatted_kinds.include?(kind)

        # NOTE(review): format_wiki! is defined elsewhere; this relies on it
        # modifying its argument in place, as the original code did.
        format_wiki!(element.last) unless verbatim
        text = element.last
        text += "\n\n" if kind == :mw_redirect
        contents << text
      end
      format_wiki!(contents)
      convert_characters!(contents)

      ##### cleanup #####
      # An article whose body is only whitespace yields no title either.
      body = (/\A[\s ]*\z/m =~ contents) ? "" : title + "\n" + contents
      body = body.gsub(/\[ref\]\s*\[\/ref\]/m){""}
      body.gsub(/\n\n\n+/m){"\n\n"} + "\n"
    end
  end
end
|
76
|
-
|
data/lib/wp2txt/mw_api.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
|
6
|
-
require 'uri'
|
7
|
-
require 'net/http'
|
8
|
-
require 'json'
|
9
|
-
require 'utils'
|
10
|
-
|
11
|
-
|
12
|
-
module Wp2txt
|
13
|
-
|
14
|
-
# POSTs +data+ to +uri_string+ and returns the response body as a String.
#
# uri_string - absolute URL of the endpoint; an empty path is treated as "/".
# data       - Hash of form fields. Pairs are joined as "key=value" with "&"
#              and are NOT escaped here, so values must already be
#              form-encoded by the caller.
#
# The connection honours the URL's port and uses TLS for https URLs
# (previously every request went to the host's default plain-HTTP port).
def post_request(uri_string, data={})
  body = data.map { |key, value| "#{key}=#{value}" }.join("&")
  uri = URI.parse(uri_string)
  uri.path = "/" if uri.path.empty?
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.post(uri.path, body).body
end
|
21
|
-
|
22
|
-
# Asks the API at +uri+ to expand +template+ (action=expandtemplates) in the
# context of +page+ and returns the expanded text.
#
# uri      - URL of the API endpoint, passed to post_request.
# template - wikitext to expand.
# page     - page title sent as the "title" field.
#
# Returns the expanded text after special_chr and chrref_to_utf have been
# applied. If the response lacks the expected structure, the error is
# reported on stderr and the unexpanded +template+ is returned instead of
# terminating the whole process.
def expand_template(uri, template, page)
  # encode_www_form_component escapes "&", "=" and "+", which would otherwise
  # break the form body built by post_request; URI.escape left them intact
  # and no longer exists on Ruby 3.0+.
  data = {"action" => "expandtemplates",
          "format" => "json",
          "text"   => URI.encode_www_form_component(template),
          "title"  => URI.encode_www_form_component(page)}
  jsn = post_request(uri, data)
  hash = JSON.parse(jsn)
  begin
    result = special_chr(hash["expandtemplates"]["*"])
    # NOTE(review): both gsub calls replace a string with itself as written;
    # confirm whether the patterns were meant to be escaped brace entities.
    chrref_to_utf(result).gsub("{{", "{{").gsub("}}", "}}")
  rescue => e
    warn "ERROR!"
    warn e.inspect
    template
  end
end
|
42
|
-
|
43
|
-
# Asks the API at +uri+ to render +wikitext+ (action=parse) in the context of
# +page+ and returns the rendered text.
#
# uri      - URL of the API endpoint, passed to post_request.
# wikitext - wikitext to parse.
# page     - page title sent as the "title" field.
#
# Returns the rendered text after special_chr and chrref_to_utf have been
# applied. If the response lacks the expected structure, the error is
# reported on stderr and the original +wikitext+ is returned instead of
# terminating the whole process.
def parse_wikitext(uri, wikitext, page)
  # encode_www_form_component escapes "&", "=" and "+", which would otherwise
  # break the form body built by post_request; URI.escape left them intact
  # and no longer exists on Ruby 3.0+.
  data = {"action" => "parse",
          "format" => "json",
          "text"   => URI.encode_www_form_component(wikitext),
          "title"  => URI.encode_www_form_component(page)}
  jsn = post_request(uri, data)
  hash = JSON.parse(jsn)
  begin
    result = special_chr(hash["parse"]["text"]["*"])
    # NOTE(review): both gsub calls replace a string with itself as written;
    # confirm whether the patterns were meant to be escaped bracket entities.
    chrref_to_utf(result).gsub("[[", "[[").gsub("]]", "]]")
  rescue => e
    warn "ERROR!"
    warn e.inspect
    # The old fallback named an undefined local (`template`); the input
    # wikitext is the value this method can actually hand back.
    wikitext
  end
end
|
63
|
-
|
64
|
-
end
|
65
|
-
|
data/lib/wp2txt/progressbar.rb
DELETED
@@ -1,305 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
#
|
4
|
-
# Ruby/ProgressBar - a text progress bar library
|
5
|
-
#
|
6
|
-
# Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
|
7
|
-
# All rights reserved.
|
8
|
-
# This is free software with ABSOLUTELY NO WARRANTY.
|
9
|
-
#
|
10
|
-
# You can redistribute it and/or modify it under the terms
|
11
|
-
# of Ruby's license.
|
12
|
-
#
|
13
|
-
|
14
|
-
module Wp2txt
|
15
|
-
begin
  require "io/console" # supplies IO#winsize, used by ProgressBar#get_width
rescue LoadError
  # Without io/console the bar simply assumes the default terminal width.
end

# Single-line text progress bar.
#
# The bar redraws itself on +out+ (STDERR by default) using "\r" so that each
# update overwrites the previous one, and ends the line with "\n" once it is
# finished. The line is laid out by +@format+, whose arguments are produced by
# the private fmt_* methods named in +@format_arguments+.
class ProgressBar
  # Width assumed when the output's terminal size cannot be determined.
  DEFAULT_WIDTH = 80

  attr_reader :title
  attr_reader :current
  attr_reader :total
  attr_accessor :start_time

  # sprintf template for the whole line, and the list of fmt_* suffixes
  # (as symbols) whose results fill it.
  attr_writer :format, :format_arguments

  # title - label shown at the left (truncated to the title column).
  # total - value of +current+ that corresponds to 100%.
  # out   - object receiving the output; must respond to print and flush.
  #
  # Draws the empty bar immediately.
  def initialize (title, total, out = STDERR)
    @title = title
    @total = total
    @out = out
    @terminal_width = 80
    @bar_mark = "o"
    @current = 0
    @previous = 0
    @finished_p = false
    @start_time = Time.now
    @previous_time = @start_time
    @title_width = 14
    @format = "%-#{@title_width}s %3d%% %s %s"
    @format_arguments = [:title, :percentage, :bar, :stat]
    clear
    show
  end

  # Blanks the current terminal line and returns the cursor to column 0.
  def clear
    @out.print "\r"
    @out.print(" " * (get_width - 1))
    @out.print "\r"
  end

  # Jumps to 100%, marks the bar finished and draws the final line.
  def finish
    @current = @total
    @finished_p = true
    show
  end

  # True once finish or halt has been called.
  def finished?
    @finished_p
  end

  # Switches the statistics column to bytes / transfer rate / time.
  def file_transfer_mode
    @format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
  end

  # Marks the bar finished at its current position and draws the final line.
  def halt
    @finished_p = true
    show
  end

  # Advances the bar by +step+ (capped at +total+), redrawing if needed.
  def inc (step = 1)
    @current += step
    @current = @total if @current > @total
    show_if_needed
    @previous = @current
  end

  # Moves the bar to the absolute position +count+.
  # Raises RuntimeError when +count+ is outside 0..total.
  def set (count)
    if count < 0 || count > @total
      raise "invalid count: #{count} (total: #{@total})"
    end
    @current = count
    show_if_needed
    @previous = @current
  end

  def inspect
    "#<ProgressBar:#{@current}/#{@total}>"
  end

  private

  # The bar itself: marks for the completed share, spaces for the rest.
  def fmt_bar
    bar_width = do_percentage * @terminal_width / 100
    sprintf("|%s%s|",
            @bar_mark * bar_width,
            " " * (@terminal_width - bar_width))
  end

  def fmt_percentage
    do_percentage
  end

  # Elapsed time once finished, estimated remaining time before that.
  def fmt_stat
    @finished_p ? elapsed : eta
  end

  def fmt_stat_for_file_transfer
    sprintf("%s %s %s", bytes, transfer_rate, @finished_p ? elapsed : eta)
  end

  def fmt_title
    @title[0,(@title_width - 1)] + ":"
  end

  # Formats a byte count with a B/KB/MB/GB suffix (1024-based).
  def convert_bytes (bytes)
    if bytes < 1024
      sprintf("%6dB", bytes)
    elsif bytes < 1024 * 1000 # 1000kb
      sprintf("%5.1fKB", bytes.to_f / 1024)
    elsif bytes < 1024 * 1024 * 1000  # 1000mb
      sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
    else
      sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
    end
  end

  def transfer_rate
    bytes_per_second = @current.to_f / (Time.now - @start_time)
    sprintf("%s/s", convert_bytes(bytes_per_second))
  end

  def bytes
    convert_bytes(@current)
  end

  # Formats a number of seconds as HH:MM:SS.
  def format_time (t)
    t = t.to_i
    sec = t % 60
    min = (t / 60) % 60
    hour = t / 3600
    sprintf("%02d:%02d:%02d", hour, min, sec)
  end

  # ETA stands for Estimated Time of Arrival.
  def eta
    return "ETA: --:--:--" if @current == 0

    spent = Time.now - @start_time
    remaining = spent * @total / @current - spent
    sprintf("ETA: %s", format_time(remaining))
  end

  def elapsed
    sprintf("Time: %s", format_time(Time.now - @start_time))
  end

  # Line terminator: stay on the line while running, move on when finished.
  def eol
    @finished_p ? "\n" : "\r"
  end

  # Completed share as an integer percentage; a zero total counts as done.
  def do_percentage
    if @total.zero?
      100
    else
      @current * 100 / @total
    end
  end

  # Column count of the output's terminal, or DEFAULT_WIDTH when it cannot be
  # determined (output is not a terminal, has no winsize, or reports a
  # non-positive width, which would otherwise make #clear build a string of
  # negative length).
  def get_width
    return DEFAULT_WIDTH unless @out.respond_to?(:winsize)

    _rows, cols = @out.winsize
    cols > 0 ? cols : DEFAULT_WIDTH
  rescue StandardError, NotImplementedError
    # NotImplementedError is a ScriptError, so it has to be named explicitly;
    # signals and SystemExit are deliberately left alone.
    DEFAULT_WIDTH
  end

  # Renders the line and prints it, first resizing the bar portion until the
  # line is exactly one column narrower than the terminal.
  def show
    arguments = @format_arguments.map { |name| send(sprintf("fmt_%s", name)) }
    line = sprintf(@format, *arguments)

    width = get_width
    if line.length == width - 1
      @out.print(line + eol)
      @out.flush
    elsif line.length >= width
      # Too wide: shrink the bar; when nothing is left to shrink, print as is.
      @terminal_width = [@terminal_width - (line.length - width + 1), 0].max
      if @terminal_width == 0 then @out.print(line + eol) else show end
    else # line.length < width - 1
      @terminal_width += width - line.length + 1
      show
    end
    @previous_time = Time.now
  end

  # Redraws only when the integer percentage changed, at least a second has
  # passed since the last draw, or the bar is finished.
  def show_if_needed
    if @total.zero?
      cur_percentage = 100
      prev_percentage = 0
    else
      cur_percentage  = (@current  * 100 / @total).to_i
      prev_percentage = (@previous * 100 / @total).to_i
    end

    # Use "!=" instead of ">" to support negative changes
    if cur_percentage != prev_percentage ||
        Time.now - @previous_time >= 1 || @finished_p
      show
    end
  end
end
|
232
|
-
|
233
|
-
# Progress bar that counts down: it reports the share still remaining
# rather than the share already completed.
class ReversedProgressBar < ProgressBar
  # Complement of the parent's percentage.
  def do_percentage
    completed = super
    100 - completed
  end
end
|
238
|
-
|
239
|
-
##########################################
|
240
|
-
|
241
|
-
# Progress bar titled after a file's base name that also carries two
# counters, +nhits+ and +nsentences+, for its owner to update.
class NewProgressBar < ProgressBar

  attr_accessor :nhits, :nsentences

  # Names under which the inherited implementations stay reachable.
  alias org_initialize initialize
  alias org_fmt_stat fmt_stat

  # filename - path whose base name becomes the bar's title.
  # size     - total passed on to ProgressBar.
  def initialize(filename, size)
    @nhits = 0
    @nsentences = 0
    super(File.basename(filename), size)
  end

  # Statistics column; currently the inherited text unchanged.
  def fmt_stat
    super # + " Hits: " + @nhits.to_s
  end
end
|
259
|
-
|
260
|
-
# Command-line progress reporter: prints messages to standard output and
# drives a NewProgressBar from absolute progress values.
#
# The bar only exists between data_set and the next before/initialize, so
# every method that touches it is a no-op on the bar while none is set.
class CmdProgbar

  attr_accessor :pbar, :last_value

  def initialize
    reset
  end

  # Prints +str+; without a trailing newline when +i+ is 0, with one otherwise.
  def msg(str, i = nil)
    case i
    when 0
      print str
    else
      puts str
    end
  end

  # Advances the bar by the difference between +value+ and the previously
  # reported value (truncated to an integer) and remembers +value+.
  # +elt+ and +eta+ are stored but not read anywhere in this class.
  def prg_update(value, elt, eta)
    @elt = elt
    @eta = eta
    offset = value - @last_value
    @pbar.inc(offset.to_i) if @pbar
    @last_value = value
  end

  # Creates the bar for +filename+ with +linesize+ as its total.
  def data_set(filename, linesize)
    @pbar = NewProgressBar.new(filename, linesize)
  end

  # Copies the two counters onto the bar.
  def data_update(nhits, nsentences)
    return unless @pbar

    @pbar.nhits = nhits
    @pbar.nsentences = nsentences
  end

  # Returns the reporter to its initial state (no bar, last value 0).
  def before
    reset
  end

  # Completes the bar, if there is one. Always returns true.
  def after
    @pbar.finish if @pbar
    true
  end

  private

  # Shared by initialize and before so that resetting does not have to
  # re-invoke the constructor.
  def reset
    @last_value = 0
    @pbar = nil
  end
end
|
305
|
-
end
|