wp2txt 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ $: << File.join(File.dirname(__FILE__))
5
+
6
+ require 'strscan'
7
+ require 'utils'
8
+
9
module Wp2txt

  # Element types produced by Article#parse; a consumer can later choose
  # which of them to print:
  #   :mw_heading
  #   :mw_htable
  #   :mw_table
  #   :mw_quote
  #   :mw_unordered
  #   :mw_ordered
  #   :mw_definition
  #   :mw_pre
  #   :mw_paragraph
  #   :mw_comment
  #   :mw_math
  #   :mw_source
  #   :mw_inputbox
  #   :mw_template
  #   :mw_link
  #   :mw_summary
  #   :mw_blank
  #   :mw_redirect
  # NOTE(review): :mw_quote, :mw_comment and :mw_summary are listed for
  # completeness but are not emitted by the line classifier in this class.

  # An article is a list of elements, each of which is a two-item array
  # [TYPE, string], built by classifying the wiki source line by line.
  class Article

    include Wp2txt
    attr_accessor :elements, :title

    # Class variables, so the regexps are compiled only once.
    # Names with a trailing 1 match an opening tag/markup,
    # names with a trailing 2 match a closing tag/markup,
    # names without a trailing number match opening and closing on one line.

    @@in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
    @@in_link_regex = Regexp.new('^\s*\[.*\]\s*$')

    @@in_inputbox_regex  = Regexp.new('<inputbox>.*?<\/inputbox>')
    @@in_inputbox_regex1 = Regexp.new('<inputbox>')
    @@in_inputbox_regex2 = Regexp.new('<\/inputbox>')

    @@in_source_regex  = Regexp.new('<source.*?>.*?<\/source>')
    @@in_source_regex1 = Regexp.new('<source.*?>')
    @@in_source_regex2 = Regexp.new('<\/source>')

    @@in_math_regex  = Regexp.new('<math.*?>.*?<\/math>')
    @@in_math_regex1 = Regexp.new('<math.*?>')
    @@in_math_regex2 = Regexp.new('<\/math>')

    @@in_heading_regex = Regexp.new('^=+.*?=+$')

    # A table opened and closed on the same line. Any content is allowed
    # between the tags; otherwise such a line would only match the opening
    # regexp below and switch the parser into multi-line table mode.
    @@in_html_table_regex  = Regexp.new('<table\b.*?>.*?<\/\s*table>')
    @@in_html_table_regex1 = Regexp.new('<table\b')
    @@in_html_table_regex2 = Regexp.new('<\/\s*table>')

    @@in_table_regex1 = Regexp.new('^\s*\{\|')
    @@in_table_regex2 = Regexp.new('^\|\}.*?$')

    @@in_unordered_regex  = Regexp.new('^\*')
    @@in_ordered_regex    = Regexp.new('^\#')
    @@in_pre_regex        = Regexp.new('^ ')
    @@in_definition_regex = Regexp.new('^[\;\:]')

    @@blank_line_regex = Regexp.new('^\s*$')

    @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)

    # Multi-line modes mapped to the regexp that ends them.
    @@closing_regexes = {
      :mw_table    => @@in_table_regex2,
      :mw_inputbox => @@in_inputbox_regex2,
      :mw_source   => @@in_source_regex2,
      :mw_math     => @@in_math_regex2,
      :mw_htable   => @@in_html_table_regex2
    }

    # text::          wiki markup of the article; parsed immediately.
    # title::         article title; surrounding whitespace is stripped.
    # strip_tmarker:: when true, the leading list/definition/pre markers are
    #                 removed from the text of those elements.
    def initialize(text, title = "", strip_tmarker = false)
      @title = title.strip
      @strip_tmarker = strip_tmarker
      parse text
    end

    # Builds one element: a [type, text] pair.
    def create_element(tp, text)
      [tp, text]
    end

    # Splits +source+ into elements, one line at a time, stores them in
    # @elements and returns that array.
    #
    # A line that opens a multi-line construct (table, inputbox, source,
    # math, HTML table) starts a new element and every following line is
    # appended to it, up to and including the line matching the closing
    # regexp. All other lines become one element each.
    def parse(source)
      @elements = []
      mode = nil
      source.each_line do |line|

        if mode
          # Inside a multi-line construct: the closing line still belongs
          # to the element, so test first and append unconditionally.
          mode = nil if @@closing_regexes[mode] =~ line
          @elements.last.last << line
          next
        end

        # Order matters: the first matching pattern decides the type.
        case line
        when @@blank_line_regex
          @elements << create_element(:mw_blank, "\n")
        when @@redirect_regex
          @elements << create_element(:mw_redirect, line)
        when @@in_template_regex
          @elements << create_element(:mw_template, line)
        when @@in_heading_regex
          @elements << create_element(:mw_heading, "\n" + line + "\n")
        when @@in_inputbox_regex
          @elements << create_element(:mw_inputbox, line)
        when @@in_inputbox_regex1
          mode = :mw_inputbox
          @elements << create_element(:mw_inputbox, line)
        when @@in_source_regex
          @elements << create_element(:mw_source, line)
        when @@in_source_regex1
          mode = :mw_source
          @elements << create_element(:mw_source, line)
        when @@in_math_regex
          @elements << create_element(:mw_math, line)
        when @@in_math_regex1
          mode = :mw_math
          @elements << create_element(:mw_math, line)
        when @@in_html_table_regex
          @elements << create_element(:mw_htable, line)
        when @@in_html_table_regex1
          mode = :mw_htable
          @elements << create_element(:mw_htable, line)
        when @@in_table_regex1
          mode = :mw_table
          @elements << create_element(:mw_table, line)
        when @@in_unordered_regex
          line = line.sub(/\A[\*\#\;\:\ ]+/, "") if @strip_tmarker
          @elements << create_element(:mw_unordered, line)
        when @@in_ordered_regex
          line = line.sub(/\A[\*\#\;\:\ ]+/, "") if @strip_tmarker
          @elements << create_element(:mw_ordered, line)
        when @@in_pre_regex
          # The marker of a preformatted line is its single leading space.
          line = line.sub(/\A /, "") if @strip_tmarker
          @elements << create_element(:mw_pre, line)
        when @@in_definition_regex
          line = line.sub(/\A[\;\:\ ]+/, "") if @strip_tmarker
          @elements << create_element(:mw_definition, line)
        when @@in_link_regex
          @elements << create_element(:mw_link, line)
        else
          @elements << create_element(:mw_paragraph, line)
        end
      end
      @elements
    end
  end
end
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ $: << File.join(File.dirname(__FILE__))
5
+
6
+ require 'uri'
7
+ require 'net/http'
8
+ require 'json'
9
+ require 'utils'
10
+
11
+
12
module Wp2txt

  # Sends +data+ as an application/x-www-form-urlencoded POST to
  # +uri_string+ and returns the response body as a String.
  #
  # uri_string:: target URL; an empty path is treated as "/". The URL's
  #              port is honoured and TLS is used for https URLs.
  # data::       Hash of form fields. Keys and values are passed RAW; they
  #              are form-encoded here, so callers must not pre-escape them.
  def post_request(uri_string, data={})
    uri = URI.parse(uri_string)
    path = uri.path.to_s.empty? ? "/" : uri.path
    # encode_www_form escapes "&", "=", "+" and non-ASCII bytes, all of
    # which occur in wikitext and would otherwise corrupt the form body.
    body = URI.encode_www_form(data)
    headers = {"Content-Type" => "application/x-www-form-urlencoded"}
    Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == "https") do |http|
      http.post(path, body, headers).body
    end
  end

  # Asks the MediaWiki-style API at +uri+ to expand +template+ in the
  # context of the page titled +page+ (action=expandtemplates).
  #
  # Returns the expanded text after passing it through special_chr and
  # chrref_to_utf, with any remaining "{{" / "}}" replaced by numeric
  # character references.
  # If the response lacks the expected structure, the error is printed and
  # the process exits.
  def expand_template(uri, template, page)
    data = {"action" => "expandtemplates",
            "format" => "json",
            "text"   => template,
            "title"  => page}
    jsn = post_request(uri, data)
    hash = JSON.parse(jsn)
    begin
      result = hash["expandtemplates"]["*"]
      result = special_chr(result)
      return chrref_to_utf(result).gsub("{{", "&#123;&#123;").gsub("}}", "&#125;&#125;")
    rescue => e
      puts "ERROR!"
      p e
      exit
    end
  end

  # Asks the MediaWiki-style API at +uri+ to render +wikitext+ in the
  # context of the page titled +page+ (action=parse).
  #
  # Returns the rendered text after passing it through special_chr and
  # chrref_to_utf, with any remaining "[[" / "]]" replaced by numeric
  # character references.
  # If the response lacks the expected structure, the error is printed and
  # the process exits.
  def parse_wikitext(uri, wikitext, page)
    data = {"action" => "parse",
            "format" => "json",
            "text"   => wikitext,
            "title"  => page}
    jsn = post_request(uri, data)
    hash = JSON.parse(jsn)
    begin
      result = hash["parse"]["text"]["*"]
      result = special_chr(result)
      return chrref_to_utf(result).gsub("[[", "&#91;&#91;").gsub("]]", "&#93;&#93;")
    rescue => e
      puts "ERROR!"
      p e
      exit
    end
  end

end
65
+
@@ -0,0 +1,305 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # Ruby/ProgressBar - a text progress bar library
5
+ #
6
+ # Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
7
+ # All rights reserved.
8
+ # This is free software with ABSOLUTELY NO WARRANTY.
9
+ #
10
+ # You can redistribute it and/or modify it under the terms
11
+ # of Ruby's license.
12
+ #
13
+
14
+ module Wp2txt
15
# A single-line text progress bar that redraws itself in place on +out+.
#
# The bar is drawn on creation and again whenever the integer percentage
# changes, at least one second has passed since the last redraw, or the
# bar has been finished/halted.
class ProgressBar

  attr_reader :title
  attr_reader :current
  attr_reader :total
  attr_accessor :start_time

  # title:: label shown at the left (truncated to the title column width).
  # total:: value of #current that corresponds to 100%.
  # out::   object the bar is printed to; needs #print and #flush. Its
  #         #ioctl, when available, is used to query the terminal width.
  def initialize(title, total, out = STDERR)
    @title = title
    @total = total
    @out = out
    @terminal_width = 80
    @bar_mark = "o"
    @current = 0
    @previous = 0
    @finished_p = false
    @start_time = Time.now
    @previous_time = @start_time
    @title_width = 14
    @format = "%-#{@title_width}s %3d%% %s %s"
    @format_arguments = [:title, :percentage, :bar, :stat]
    clear
    show
  end

  # Blanks the current terminal line and returns the cursor to column 0.
  def clear
    @out.print "\r"
    @out.print(" " * (get_width - 1))
    @out.print "\r"
  end

  # Jumps to 100%, marks the bar finished and draws the final line.
  def finish
    @current = @total
    @finished_p = true
    show
  end

  # True once #finish or #halt has been called.
  def finished?
    @finished_p
  end

  # Switches the statistics column to byte count / transfer rate / time.
  def file_transfer_mode
    @format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
  end

  # Replaces the sprintf format used to render a line.
  def format=(format)
    @format = format
  end

  # Replaces the list of fields fed to the format; each symbol +x+ is
  # rendered by the private method +fmt_x+.
  def format_arguments=(arguments)
    @format_arguments = arguments
  end

  # Marks the bar finished at its current position and draws the final line.
  def halt
    @finished_p = true
    show
  end

  # Advances the bar by +step+ (clamped to the total) and redraws if needed.
  def inc(step = 1)
    @current += step
    @current = @total if @current > @total
    show_if_needed
    @previous = @current
  end

  # Moves the bar to the absolute position +count+ and redraws if needed.
  # Raises RuntimeError when +count+ is outside 0..total.
  def set(count)
    if count < 0 || count > @total
      raise "invalid count: #{count} (total: #{@total})"
    end
    @current = count
    show_if_needed
    @previous = @current
  end

  def inspect
    "#<ProgressBar:#{@current}/#{@total}>"
  end

  private

  # The bar itself: @terminal_width cells between two pipes.
  def fmt_bar
    filled = do_percentage * @terminal_width / 100
    "|" + (@bar_mark * filled) + (" " * (@terminal_width - filled)) + "|"
  end

  def fmt_percentage
    do_percentage
  end

  # Elapsed time once finished, estimated remaining time before that.
  def fmt_stat
    @finished_p ? elapsed : eta
  end

  def fmt_stat_for_file_transfer
    timing = @finished_p ? elapsed : eta
    sprintf("%s %s %s", bytes, transfer_rate, timing)
  end

  def fmt_title
    @title[0, (@title_width - 1)] + ":"
  end

  # Renders a byte count with a B/KB/MB/GB unit. The unit switches at
  # 1000 of the smaller unit so the number always fits its column.
  def convert_bytes(bytes)
    if bytes < 1024
      sprintf("%6dB", bytes)
    elsif bytes < 1024 * 1000 # 1000kb
      sprintf("%5.1fKB", bytes.to_f / 1024)
    elsif bytes < 1024 * 1024 * 1000 # 1000mb
      sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
    else
      sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
    end
  end

  def transfer_rate
    bytes_per_second = @current.to_f / (Time.now - @start_time)
    sprintf("%s/s", convert_bytes(bytes_per_second))
  end

  def bytes
    convert_bytes(@current)
  end

  # Formats a duration in seconds as HH:MM:SS.
  def format_time(t)
    t = t.to_i
    sec = t % 60
    min = (t / 60) % 60
    hour = t / 3600
    sprintf("%02d:%02d:%02d", hour, min, sec)
  end

  # ETA stands for Estimated Time of Arrival: the remaining time,
  # extrapolated linearly from the progress made so far.
  def eta
    return "ETA: --:--:--" if @current == 0

    spent = Time.now - @start_time
    remaining = spent * @total / @current - spent
    sprintf("ETA: %s", format_time(remaining))
  end

  def elapsed
    sprintf("Time: %s", format_time(Time.now - @start_time))
  end

  # A finished bar ends its line; an unfinished one returns to column 0
  # so the next draw overwrites it.
  def eol
    @finished_p ? "\n" : "\r"
  end

  def do_percentage
    if @total.zero?
      100
    else
      @current * 100 / @total
    end
  end

  # Width of the terminal behind @out in columns, or 80 when it cannot be
  # determined.
  def get_width
    # FIXME: I don't know how portable it is.
    default_width = 80
    begin
      tiocgwinsz = 0x5413
      data = [0, 0, 0, 0].pack("SSSS")
      if @out.ioctl(tiocgwinsz, data) >= 0
        # Second field of the unpacked buffer is the column count.
        cols = data.unpack("SSSS")[1]
        # A reported width of 0 is unusable (it would make #clear build a
        # string of negative length), so treat it like "unknown".
        cols > 0 ? cols : default_width
      else
        default_width
      end
    rescue StandardError, NotImplementedError
      # Covers outputs without #ioctl, outputs that are not terminals and
      # platforms where ioctl is unavailable, while letting Interrupt and
      # SystemExit propagate.
      default_width
    end
  end

  # Renders one line and prints it once it is exactly one column narrower
  # than the terminal; otherwise adjusts the bar width and tries again.
  def show
    arguments = @format_arguments.map { |name| send(sprintf("fmt_%s", name)) }
    line = sprintf(@format, *arguments)

    width = get_width
    if line.length == width - 1
      @out.print(line + eol)
      @out.flush
    elsif line.length >= width
      @terminal_width = [@terminal_width - (line.length - width + 1), 0].max
      # With no bar left to shrink, print the over-long line as it is.
      if @terminal_width == 0 then @out.print(line + eol) else show end
    else # line.length < width - 1
      @terminal_width += width - line.length + 1
      show
    end
    @previous_time = Time.now
  end

  def show_if_needed
    if @total.zero?
      cur_percentage = 100
      prev_percentage = 0
    else
      cur_percentage = (@current * 100 / @total).to_i
      prev_percentage = (@previous * 100 / @total).to_i
    end

    # Use "!=" instead of ">" to support negative changes
    if cur_percentage != prev_percentage ||
        Time.now - @previous_time >= 1 || @finished_p
      show
    end
  end
end
232
+
233
# A ProgressBar whose percentage counts down: it reports 100 minus the
# percentage the parent class computes.
class ReversedProgressBar < ProgressBar
  # Complement of the parent's percentage.
  def do_percentage
    forward = super()
    100 - forward
  end
end
238
+
239
+ ##########################################
240
+
241
# ProgressBar variant created from a file name and a size, which also
# carries two counters (+nhits+, +nsentences+) that callers may update.
class NewProgressBar < ProgressBar

  attr_accessor :nhits, :nsentences

  # Keep handles on the inherited implementations before overriding them.
  alias_method :org_initialize, :initialize
  alias_method :org_fmt_stat, :fmt_stat

  # filename:: path whose basename becomes the bar's title.
  # size::     total passed on to the inherited constructor.
  def initialize(filename, size)
    @nhits = 0
    @nsentences = 0
    label = File.basename(filename)
    org_initialize(label, size)
  end

  # Statistics column; currently just the inherited one (the hit counter
  # is not appended).
  def fmt_stat
    org_fmt_stat
  end
end
259
+
260
# Command-line front end that owns a progress bar (+pbar+) and feeds it
# absolute progress values, remembering the last value seen.
class CmdProgbar

  attr_accessor :pbar, :last_value

  def initialize
    @last_value = 0
    @pbar = nil
  end

  # Writes +str+ to standard output: without a trailing newline when +i+
  # is 0, with one otherwise.
  def msg(str, i = nil)
    if i == 0
      print str
    else
      puts str
    end
  end

  # Advances the bar by the (truncated) difference between +value+ and the
  # previously reported value; +elt+ and +eta+ are only stored.
  def prg_update(value, elt, eta)
    @elt = elt
    @eta = eta
    delta = (value - @last_value).to_i
    @pbar.inc(delta)
    @last_value = value
  end

  # Creates the bar for +filename+ with +linesize+ as its total.
  def data_set(filename, linesize)
    @pbar = NewProgressBar.new(filename, linesize)
  end

  # Copies the two counters onto the bar.
  def data_update(nhits, nsentences)
    @pbar.nhits = nhits
    @pbar.nsentences = nsentences
  end

  # Resets this object to its freshly constructed state.
  def before
    initialize
  end

  # Finishes the bar; always returns true.
  def after
    @pbar.finish
    true
  end

end
305
+ end