wp2txt 0.9.5.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/benchmark.rb DELETED
@@ -1,76 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
5
- $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
-
7
- require 'wp2txt'
8
- require 'wp2txt/utils'
9
- include Wp2txt
10
- require 'benchmark'
11
-
12
- data_dir = File.join(File.dirname(__FILE__), '..', "data")
13
-
14
- parent = Wp2txt::CmdProgbar.new
15
- input_file = File.join(data_dir, "testdata_ja.bz2")
16
- output_dir = data_dir
17
- tfile_size = 10
18
- num_threads = 1
19
- convert = true
20
- strip_tmarker = true
21
-
22
- Benchmark.bm do |x|
23
- x.report do
24
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
25
- wpconv.extract_text do |article|
26
- format_wiki!(article.title)
27
- title = "[[#{article.title}]]\n"
28
- convert_characters!(title)
29
-
30
- contents = "\nCATEGORIES: "
31
- contents += article.categories.join(", ")
32
- contents += "\n\n"
33
-
34
- article.elements.each do |e|
35
- case e.first
36
- when :mw_heading
37
- format_wiki!(e.last)
38
- line = e.last
39
- when :mw_paragraph
40
- format_wiki!(e.last)
41
- line = e.last
42
- when :mw_table, :mw_htable
43
- format_wiki!(e.last)
44
- line = e.last
45
- when :mw_pre
46
- line = e.last
47
- when :mw_quote
48
- format_wiki!(e.last)
49
- line = e.last
50
- when :mw_unordered, :mw_ordered, :mw_definition
51
- format_wiki!(e.last)
52
- line = e.last
53
- when :mw_redirect
54
- format_wiki!(e.last)
55
- line = e.last
56
- line += "\n\n"
57
- else
58
- next
59
- end
60
- contents << line
61
- end
62
- format_wiki!(contents)
63
- convert_characters!(contents)
64
-
65
- ##### cleanup #####
66
- if /\A[\s ]*\z/m =~ contents
67
- result = ""
68
- else
69
- result = title + "\n" + contents
70
- end
71
- result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
72
- result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
73
- end
74
- end
75
- end
76
-
data/lib/wp2txt/mw_api.rb DELETED
@@ -1,65 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
5
-
6
- require 'uri'
7
- require 'net/http'
8
- require 'json'
9
- require 'utils'
10
-
11
-
12
- module Wp2txt
13
-
14
- def post_request(uri_string, data={})
15
- data = data.map{ |k, v| "#{k}=#{v}" }.join("&")
16
- uri = URI.parse(uri_string)
17
- uri.path = "/" if uri.path.empty?
18
- http = Net::HTTP.new(uri.host)
19
- return http.post(uri.path, data).body
20
- end
21
-
22
- def expand_template(uri, template, page)
23
- text = URI.escape(template)
24
- title = URI.escape(page)
25
- data = {"action" => "expandtemplates",
26
- "format" => "json",
27
- "text" => text,
28
- "title" => title}
29
- jsn = post_request(uri, data)
30
- hash = JSON.parse(jsn)
31
- begin
32
- result = hash["expandtemplates"]["*"]
33
- result = special_chr(result)
34
- return chrref_to_utf(result).gsub("{{", "&#123;&#123;").gsub("}}", "&#125;&#125;")
35
- rescue => e
36
- puts "ERROR!"
37
- p e
38
- exit
39
- template
40
- end
41
- end
42
-
43
- def parse_wikitext(uri, wikitext, page)
44
- text = URI.escape(wikitext)
45
- title = URI.escape(page)
46
- data = {"action" => "parse",
47
- "format" => "json",
48
- "text" => text,
49
- "title" => title}
50
- jsn = post_request(uri, data)
51
- hash = JSON.parse(jsn)
52
- begin
53
- result = hash["parse"]["text"]["*"]
54
- result = special_chr(result)
55
- return chrref_to_utf(result).gsub("[[", "&#91;&#91;").gsub("]]", "&#93;&#93;")
56
- rescue => e
57
- puts "ERROR!"
58
- p e
59
- exit
60
- template
61
- end
62
- end
63
-
64
- end
65
-
@@ -1,305 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
- #
4
- # Ruby/ProgressBar - a text progress bar library
5
- #
6
- # Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
7
- # All rights reserved.
8
- # This is free software with ABSOLUTELY NO WARRANTY.
9
- #
10
- # You can redistribute it and/or modify it under the terms
11
- # of Ruby's license.
12
- #
13
-
14
- module Wp2txt
15
- class ProgressBar
16
-
17
- def initialize (title, total, out = STDERR)
18
- @title = title
19
- @total = total
20
- @out = out
21
- @terminal_width = 80
22
- @bar_mark = "o"
23
- @current = 0
24
- @previous = 0
25
- @finished_p = false
26
- @start_time = Time.now
27
- @previous_time = @start_time
28
- @title_width = 14
29
- @format = "%-#{@title_width}s %3d%% %s %s"
30
- @format_arguments = [:title, :percentage, :bar, :stat]
31
- clear
32
- show
33
- end
34
- attr_reader :title
35
- attr_reader :current
36
- attr_reader :total
37
- attr_accessor :start_time
38
-
39
- private
40
- def fmt_bar
41
- bar_width = do_percentage * @terminal_width / 100
42
- sprintf("|%s%s|",
43
- @bar_mark * bar_width,
44
- " " * (@terminal_width - bar_width))
45
- end
46
-
47
- def fmt_percentage
48
- do_percentage
49
- end
50
-
51
- def fmt_stat
52
- if @finished_p then elapsed else eta end
53
- end
54
-
55
- def fmt_stat_for_file_transfer
56
- if @finished_p then
57
- sprintf("%s %s %s", bytes, transfer_rate, elapsed)
58
- else
59
- sprintf("%s %s %s", bytes, transfer_rate, eta)
60
- end
61
- end
62
-
63
- def fmt_title
64
- @title[0,(@title_width - 1)] + ":"
65
- end
66
-
67
- def convert_bytes (bytes)
68
- if bytes < 1024
69
- sprintf("%6dB", bytes)
70
- elsif bytes < 1024 * 1000 # 1000kb
71
- sprintf("%5.1fKB", bytes.to_f / 1024)
72
- elsif bytes < 1024 * 1024 * 1000 # 1000mb
73
- sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
74
- else
75
- sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
76
- end
77
- end
78
-
79
- def transfer_rate
80
- bytes_per_second = @current.to_f / (Time.now - @start_time)
81
- sprintf("%s/s", convert_bytes(bytes_per_second))
82
- end
83
-
84
- def bytes
85
- convert_bytes(@current)
86
- end
87
-
88
- def format_time (t)
89
- t = t.to_i
90
- sec = t % 60
91
- min = (t / 60) % 60
92
- hour = t / 3600
93
- sprintf("%02d:%02d:%02d", hour, min, sec);
94
- end
95
-
96
- # ETA stands for Estimated Time of Arrival.
97
- def eta
98
- if @current == 0
99
- "ETA: --:--:--"
100
- else
101
- elapsed = Time.now - @start_time
102
- eta = elapsed * @total / @current - elapsed;
103
- sprintf("ETA: %s", format_time(eta))
104
- end
105
- end
106
-
107
- def elapsed
108
- elapsed = Time.now - @start_time
109
- sprintf("Time: %s", format_time(elapsed))
110
- end
111
-
112
- def eol
113
- if @finished_p then "\n" else "\r" end
114
- end
115
-
116
- def do_percentage
117
- if @total.zero?
118
- 100
119
- else
120
- @current * 100 / @total
121
- end
122
- end
123
-
124
- def get_width
125
- # FIXME: I don't know how portable it is.
126
- default_width = 80
127
- begin
128
- tiocgwinsz = 0x5413
129
- data = [0, 0, 0, 0].pack("SSSS")
130
- if @out.ioctl(tiocgwinsz, data) >= 0 then
131
- rows, cols, xpixels, ypixels = data.unpack("SSSS")
132
- if cols >= 0 then cols else default_width end
133
- else
134
- default_width
135
- end
136
- rescue Exception
137
- default_width
138
- end
139
- end
140
-
141
- def show
142
- arguments = @format_arguments.map {|method|
143
- method = sprintf("fmt_%s", method)
144
- send(method)
145
- }
146
- line = sprintf(@format, *arguments)
147
-
148
- width = get_width
149
- if line.length == width - 1
150
- @out.print(line + eol)
151
- @out.flush
152
- elsif line.length >= width
153
- @terminal_width = [@terminal_width - (line.length - width + 1), 0].max
154
- if @terminal_width == 0 then @out.print(line + eol) else show end
155
- else # line.length < width - 1
156
- @terminal_width += width - line.length + 1
157
- show
158
- end
159
- @previous_time = Time.now
160
- end
161
-
162
- def show_if_needed
163
- if @total.zero?
164
- cur_percentage = 100
165
- prev_percentage = 0
166
- else
167
- cur_percentage = (@current * 100 / @total).to_i
168
- prev_percentage = (@previous * 100 / @total).to_i
169
- end
170
-
171
- # Use "!=" instead of ">" to support negative changes
172
- if cur_percentage != prev_percentage ||
173
- Time.now - @previous_time >= 1 || @finished_p
174
- show
175
- end
176
- end
177
-
178
- public
179
- def clear
180
- @out.print "\r"
181
- @out.print(" " * (get_width - 1))
182
- @out.print "\r"
183
- end
184
-
185
- def finish
186
- @current = @total
187
- @finished_p = true
188
- show
189
- end
190
-
191
- def finished?
192
- @finished_p
193
- end
194
-
195
- def file_transfer_mode
196
- @format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
197
- end
198
-
199
- def format= (format)
200
- @format = format
201
- end
202
-
203
- def format_arguments= (arguments)
204
- @format_arguments = arguments
205
- end
206
-
207
- def halt
208
- @finished_p = true
209
- show
210
- end
211
-
212
- def inc (step = 1)
213
- @current += step
214
- @current = @total if @current > @total
215
- show_if_needed
216
- @previous = @current
217
- end
218
-
219
- def set (count)
220
- if count < 0 || count > @total
221
- raise "invalid count: #{count} (total: #{@total})"
222
- end
223
- @current = count
224
- show_if_needed
225
- @previous = @current
226
- end
227
-
228
- def inspect
229
- "#<ProgressBar:#{@current}/#{@total}>"
230
- end
231
- end
232
-
233
- class ReversedProgressBar < ProgressBar
234
- def do_percentage
235
- 100 - super
236
- end
237
- end
238
-
239
- ##########################################
240
-
241
- class NewProgressBar < ProgressBar
242
-
243
- attr_accessor :nhits, :nsentences
244
-
245
- alias org_initialize initialize
246
-
247
- def initialize(filename, size)
248
- @nhits = 0
249
- @nsentences = 0
250
- org_initialize(File.basename(filename), size)
251
- end
252
-
253
- alias org_fmt_stat fmt_stat
254
-
255
- def fmt_stat
256
- org_fmt_stat # + " Hits: " + @nhits.to_s
257
- end
258
- end
259
-
260
- class CmdProgbar
261
-
262
- attr_accessor :pbar, :last_value
263
-
264
- def initialize
265
- @last_value = 0
266
- @pbar = nil
267
- end
268
-
269
- def msg(str, i = nil)
270
- case i
271
- when 0
272
- print str
273
- else
274
- puts str
275
- end
276
- end
277
-
278
- def prg_update(value, elt, eta)
279
- @elt = elt
280
- @eta = eta
281
- offset = value - @last_value
282
- @pbar.inc(offset.to_i)
283
- @last_value = value
284
- end
285
-
286
- def data_set(filename, linesize)
287
- @pbar = NewProgressBar.new(filename, linesize)
288
- end
289
-
290
- def data_update(nhits, nsentences)
291
- @pbar.nhits = nhits
292
- @pbar.nsentences = nsentences
293
- end
294
-
295
- def before
296
- initialize
297
- end
298
-
299
- def after
300
- @pbar.finish
301
- return true
302
- end
303
-
304
- end
305
- end