wp2txt 0.9.5.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/benchmark.rb DELETED
@@ -1,76 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
5
- $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
-
7
- require 'wp2txt'
8
- require 'wp2txt/utils'
9
- include Wp2txt
10
- require 'benchmark'
11
-
12
- data_dir = File.join(File.dirname(__FILE__), '..', "data")
13
-
14
- parent = Wp2txt::CmdProgbar.new
15
- input_file = File.join(data_dir, "testdata_ja.bz2")
16
- output_dir = data_dir
17
- tfile_size = 10
18
- num_threads = 1
19
- convert = true
20
- strip_tmarker = true
21
-
22
- Benchmark.bm do |x|
23
- x.report do
24
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
25
- wpconv.extract_text do |article|
26
- format_wiki!(article.title)
27
- title = "[[#{article.title}]]\n"
28
- convert_characters!(title)
29
-
30
- contents = "\nCATEGORIES: "
31
- contents += article.categories.join(", ")
32
- contents += "\n\n"
33
-
34
- article.elements.each do |e|
35
- case e.first
36
- when :mw_heading
37
- format_wiki!(e.last)
38
- line = e.last
39
- when :mw_paragraph
40
- format_wiki!(e.last)
41
- line = e.last
42
- when :mw_table, :mw_htable
43
- format_wiki!(e.last)
44
- line = e.last
45
- when :mw_pre
46
- line = e.last
47
- when :mw_quote
48
- format_wiki!(e.last)
49
- line = e.last
50
- when :mw_unordered, :mw_ordered, :mw_definition
51
- format_wiki!(e.last)
52
- line = e.last
53
- when :mw_redirect
54
- format_wiki!(e.last)
55
- line = e.last
56
- line += "\n\n"
57
- else
58
- next
59
- end
60
- contents << line
61
- end
62
- format_wiki!(contents)
63
- convert_characters!(contents)
64
-
65
- ##### cleanup #####
66
- if /\A[\s ]*\z/m =~ contents
67
- result = ""
68
- else
69
- result = title + "\n" + contents
70
- end
71
- result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
72
- result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
73
- end
74
- end
75
- end
76
-
data/lib/wp2txt/mw_api.rb DELETED
@@ -1,65 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
5
-
6
- require 'uri'
7
- require 'net/http'
8
- require 'json'
9
- require 'utils'
10
-
11
-
12
- module Wp2txt
13
-
14
- def post_request(uri_string, data={})
15
- data = data.map{ |k, v| "#{k}=#{v}" }.join("&")
16
- uri = URI.parse(uri_string)
17
- uri.path = "/" if uri.path.empty?
18
- http = Net::HTTP.new(uri.host)
19
- return http.post(uri.path, data).body
20
- end
21
-
22
- def expand_template(uri, template, page)
23
- text = URI.escape(template)
24
- title = URI.escape(page)
25
- data = {"action" => "expandtemplates",
26
- "format" => "json",
27
- "text" => text,
28
- "title" => title}
29
- jsn = post_request(uri, data)
30
- hash = JSON.parse(jsn)
31
- begin
32
- result = hash["expandtemplates"]["*"]
33
- result = special_chr(result)
34
- return chrref_to_utf(result).gsub("{{", "&#123;&#123;").gsub("}}", "&#125;&#125;")
35
- rescue => e
36
- puts "ERROR!"
37
- p e
38
- exit
39
- template
40
- end
41
- end
42
-
43
- def parse_wikitext(uri, wikitext, page)
44
- text = URI.escape(wikitext)
45
- title = URI.escape(page)
46
- data = {"action" => "parse",
47
- "format" => "json",
48
- "text" => text,
49
- "title" => title}
50
- jsn = post_request(uri, data)
51
- hash = JSON.parse(jsn)
52
- begin
53
- result = hash["parse"]["text"]["*"]
54
- result = special_chr(result)
55
- return chrref_to_utf(result).gsub("[[", "&#91;&#91;").gsub("]]", "&#93;&#93;")
56
- rescue => e
57
- puts "ERROR!"
58
- p e
59
- exit
60
- template
61
- end
62
- end
63
-
64
- end
65
-
@@ -1,305 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
- #
4
- # Ruby/ProgressBar - a text progress bar library
5
- #
6
- # Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
7
- # All rights reserved.
8
- # This is free software with ABSOLUTELY NO WARRANTY.
9
- #
10
- # You can redistribute it and/or modify it under the terms
11
- # of Ruby's license.
12
- #
13
-
14
- module Wp2txt
15
- class ProgressBar
16
-
17
- def initialize (title, total, out = STDERR)
18
- @title = title
19
- @total = total
20
- @out = out
21
- @terminal_width = 80
22
- @bar_mark = "o"
23
- @current = 0
24
- @previous = 0
25
- @finished_p = false
26
- @start_time = Time.now
27
- @previous_time = @start_time
28
- @title_width = 14
29
- @format = "%-#{@title_width}s %3d%% %s %s"
30
- @format_arguments = [:title, :percentage, :bar, :stat]
31
- clear
32
- show
33
- end
34
- attr_reader :title
35
- attr_reader :current
36
- attr_reader :total
37
- attr_accessor :start_time
38
-
39
- private
40
- def fmt_bar
41
- bar_width = do_percentage * @terminal_width / 100
42
- sprintf("|%s%s|",
43
- @bar_mark * bar_width,
44
- " " * (@terminal_width - bar_width))
45
- end
46
-
47
- def fmt_percentage
48
- do_percentage
49
- end
50
-
51
- def fmt_stat
52
- if @finished_p then elapsed else eta end
53
- end
54
-
55
- def fmt_stat_for_file_transfer
56
- if @finished_p then
57
- sprintf("%s %s %s", bytes, transfer_rate, elapsed)
58
- else
59
- sprintf("%s %s %s", bytes, transfer_rate, eta)
60
- end
61
- end
62
-
63
- def fmt_title
64
- @title[0,(@title_width - 1)] + ":"
65
- end
66
-
67
- def convert_bytes (bytes)
68
- if bytes < 1024
69
- sprintf("%6dB", bytes)
70
- elsif bytes < 1024 * 1000 # 1000kb
71
- sprintf("%5.1fKB", bytes.to_f / 1024)
72
- elsif bytes < 1024 * 1024 * 1000 # 1000mb
73
- sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
74
- else
75
- sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
76
- end
77
- end
78
-
79
- def transfer_rate
80
- bytes_per_second = @current.to_f / (Time.now - @start_time)
81
- sprintf("%s/s", convert_bytes(bytes_per_second))
82
- end
83
-
84
- def bytes
85
- convert_bytes(@current)
86
- end
87
-
88
- def format_time (t)
89
- t = t.to_i
90
- sec = t % 60
91
- min = (t / 60) % 60
92
- hour = t / 3600
93
- sprintf("%02d:%02d:%02d", hour, min, sec);
94
- end
95
-
96
- # ETA stands for Estimated Time of Arrival.
97
- def eta
98
- if @current == 0
99
- "ETA: --:--:--"
100
- else
101
- elapsed = Time.now - @start_time
102
- eta = elapsed * @total / @current - elapsed;
103
- sprintf("ETA: %s", format_time(eta))
104
- end
105
- end
106
-
107
- def elapsed
108
- elapsed = Time.now - @start_time
109
- sprintf("Time: %s", format_time(elapsed))
110
- end
111
-
112
- def eol
113
- if @finished_p then "\n" else "\r" end
114
- end
115
-
116
- def do_percentage
117
- if @total.zero?
118
- 100
119
- else
120
- @current * 100 / @total
121
- end
122
- end
123
-
124
- def get_width
125
- # FIXME: I don't know how portable it is.
126
- default_width = 80
127
- begin
128
- tiocgwinsz = 0x5413
129
- data = [0, 0, 0, 0].pack("SSSS")
130
- if @out.ioctl(tiocgwinsz, data) >= 0 then
131
- rows, cols, xpixels, ypixels = data.unpack("SSSS")
132
- if cols >= 0 then cols else default_width end
133
- else
134
- default_width
135
- end
136
- rescue Exception
137
- default_width
138
- end
139
- end
140
-
141
- def show
142
- arguments = @format_arguments.map {|method|
143
- method = sprintf("fmt_%s", method)
144
- send(method)
145
- }
146
- line = sprintf(@format, *arguments)
147
-
148
- width = get_width
149
- if line.length == width - 1
150
- @out.print(line + eol)
151
- @out.flush
152
- elsif line.length >= width
153
- @terminal_width = [@terminal_width - (line.length - width + 1), 0].max
154
- if @terminal_width == 0 then @out.print(line + eol) else show end
155
- else # line.length < width - 1
156
- @terminal_width += width - line.length + 1
157
- show
158
- end
159
- @previous_time = Time.now
160
- end
161
-
162
- def show_if_needed
163
- if @total.zero?
164
- cur_percentage = 100
165
- prev_percentage = 0
166
- else
167
- cur_percentage = (@current * 100 / @total).to_i
168
- prev_percentage = (@previous * 100 / @total).to_i
169
- end
170
-
171
- # Use "!=" instead of ">" to support negative changes
172
- if cur_percentage != prev_percentage ||
173
- Time.now - @previous_time >= 1 || @finished_p
174
- show
175
- end
176
- end
177
-
178
- public
179
- def clear
180
- @out.print "\r"
181
- @out.print(" " * (get_width - 1))
182
- @out.print "\r"
183
- end
184
-
185
- def finish
186
- @current = @total
187
- @finished_p = true
188
- show
189
- end
190
-
191
- def finished?
192
- @finished_p
193
- end
194
-
195
- def file_transfer_mode
196
- @format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
197
- end
198
-
199
- def format= (format)
200
- @format = format
201
- end
202
-
203
- def format_arguments= (arguments)
204
- @format_arguments = arguments
205
- end
206
-
207
- def halt
208
- @finished_p = true
209
- show
210
- end
211
-
212
- def inc (step = 1)
213
- @current += step
214
- @current = @total if @current > @total
215
- show_if_needed
216
- @previous = @current
217
- end
218
-
219
- def set (count)
220
- if count < 0 || count > @total
221
- raise "invalid count: #{count} (total: #{@total})"
222
- end
223
- @current = count
224
- show_if_needed
225
- @previous = @current
226
- end
227
-
228
- def inspect
229
- "#<ProgressBar:#{@current}/#{@total}>"
230
- end
231
- end
232
-
233
- class ReversedProgressBar < ProgressBar
234
- def do_percentage
235
- 100 - super
236
- end
237
- end
238
-
239
- ##########################################
240
-
241
- class NewProgressBar < ProgressBar
242
-
243
- attr_accessor :nhits, :nsentences
244
-
245
- alias org_initialize initialize
246
-
247
- def initialize(filename, size)
248
- @nhits = 0
249
- @nsentences = 0
250
- org_initialize(File.basename(filename), size)
251
- end
252
-
253
- alias org_fmt_stat fmt_stat
254
-
255
- def fmt_stat
256
- org_fmt_stat # + " Hits: " + @nhits.to_s
257
- end
258
- end
259
-
260
- class CmdProgbar
261
-
262
- attr_accessor :pbar, :last_value
263
-
264
- def initialize
265
- @last_value = 0
266
- @pbar = nil
267
- end
268
-
269
- def msg(str, i = nil)
270
- case i
271
- when 0
272
- print str
273
- else
274
- puts str
275
- end
276
- end
277
-
278
- def prg_update(value, elt, eta)
279
- @elt = elt
280
- @eta = eta
281
- offset = value - @last_value
282
- @pbar.inc(offset.to_i)
283
- @last_value = value
284
- end
285
-
286
- def data_set(filename, linesize)
287
- @pbar = NewProgressBar.new(filename, linesize)
288
- end
289
-
290
- def data_update(nhits, nsentences)
291
- @pbar.nhits = nhits
292
- @pbar.nsentences = nsentences
293
- end
294
-
295
- def before
296
- initialize
297
- end
298
-
299
- def after
300
- @pbar.finish
301
- return true
302
- end
303
-
304
- end
305
- end