wp2txt 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -11
- data/bin/benchmark.rb +14 -10
- data/bin/wp2txt +48 -28
- data/lib/wp2txt.rb +46 -11
- data/lib/wp2txt/article.rb +49 -89
- data/lib/wp2txt/mw_api.rb +0 -0
- data/lib/wp2txt/utils.rb +174 -112
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +60 -41
- data/wp2txt.gemspec +3 -9
- metadata +3 -59
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80f68e6c1ac855160575f85f4d78ca378f0a1c2b
|
4
|
+
data.tar.gz: 16bbac80e7139ea63dd46baf54fb5deaf0840e59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 004d26fa39aae4eb194858cf85ae8aad33f65dc556a08bbfc499ead05d49e70af4f5ba5e708354aa816cd6b38d8e9860866cefa7d6c0730058e9a186ff9eec31
|
7
|
+
data.tar.gz: c2523b8afeab165c37de028eedff36e719a2472f9440469e4041c342b08463d439351a89523d959ff28d53364c76a2af44502113bb2084eacbbc8ac14306f8a4
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
|
|
8
8
|
|
9
9
|
### Features ###
|
10
10
|
|
11
|
-
* Convert dump files of Wikipedia of
|
11
|
+
* Convert dump files of Wikipedia of various languages (I hope).
|
12
12
|
* Create output files of specified size.
|
13
13
|
* Allow users to specify text elements to be extracted/converted (page titles, section titles, lists, and tables).
|
14
14
|
|
@@ -16,12 +16,6 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
|
|
16
16
|
|
17
17
|
$ gem install wp2txt
|
18
18
|
|
19
|
-
It is highly recommended that you also install the bzip2-ruby gem. See the following for details about the bzip2-ruby gem:
|
20
|
-
|
21
|
-
[https://github.com/brianmario/bzip2-ruby](https://github.com/brianmario/bzip2-ruby)
|
22
|
-
|
23
|
-
When the above gem is not found, wp2txt will try to use the bzip2 program in your command line environment. Supposedly the former option is more reliable as well as faster.
|
24
|
-
|
25
19
|
### Usage
|
26
20
|
|
27
21
|
Obtain a Wikipedia dump file (from [here](http://dumps.wikimedia.org/backup-index.html)) with a file name such as:
|
@@ -32,10 +26,10 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
32
26
|
|
33
27
|
Command line options are as follows:
|
34
28
|
|
35
|
-
*CAUTION:*
|
29
|
+
*CAUTION:* Command line options in the current version have been drastically changed from previous versions.
|
36
30
|
|
37
|
-
|
38
|
-
|
31
|
+
Usage: wp2txt [options]
|
32
|
+
where [options] are:
|
39
33
|
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
40
34
|
.txt (uncompressed) format
|
41
35
|
--output-dir, -o <s>: Output directory (default:
|
@@ -46,13 +40,15 @@ Command line options are as follows:
|
|
46
40
|
--heading, --no-heading, -d: Show section titles in output (default: true)
|
47
41
|
--title, --no-title, -t: Show page titles in output (default: true)
|
48
42
|
--table, -a: Show table source code in output
|
49
|
-
--template, -e:
|
43
|
+
--template, -e: leave inline template notations unmodified
|
50
44
|
--redirect, -r: Show redirect destination
|
51
45
|
--marker, --no-marker, -m: Show symbols prefixed to list items,
|
52
46
|
definitions, etc. (Default: true)
|
53
47
|
--category, -g: Show article category information
|
54
48
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
55
49
|
(default: 10)
|
50
|
+
--limit-recur, -u <i>: Max number of recursive call (0 to 10)
|
51
|
+
(default: 10)
|
56
52
|
--version, -v: Print version and exit
|
57
53
|
--help, -h: Show this message
|
58
54
|
|
@@ -71,6 +67,11 @@ Command line options are as follows:
|
|
71
67
|
|
72
68
|
* Yoichiro Hasebe (<yohasebe@gmail.com>)
|
73
69
|
|
70
|
+
### References ###
|
71
|
+
|
72
|
+
* Yoichiro HASEBE. 2006. [Method for using Wikipedia as Japanese corpus.](http://ci.nii.ac.jp/naid/110006226727) _Doshisha Studies in Language and Culture_ 9(2), 373-403.
|
73
|
+
* 長谷部陽一郎. 2006. [Wikipedia日本語版をコーパスとして用いた言語研究の手法](http://ci.nii.ac.jp/naid/110006226727). 『言語文化』9(2), 373-403.
|
74
|
+
|
74
75
|
### License ###
|
75
76
|
|
76
77
|
This software is distributed under the MIT License. Please see the LICENSE file.
|
data/bin/benchmark.rb
CHANGED
@@ -18,13 +18,11 @@ tfile_size = 10
|
|
18
18
|
convert = true
|
19
19
|
strip_tmarker = true
|
20
20
|
|
21
|
-
|
22
|
-
|
23
21
|
Benchmark.bm do |x|
|
24
22
|
x.report do
|
25
23
|
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
26
24
|
wpconv.extract_text do |article|
|
27
|
-
title = format_wiki article.title
|
25
|
+
title = format_wiki! article.title
|
28
26
|
title = "[[#{title}]]\n"
|
29
27
|
|
30
28
|
contents = "\nCATEGORIES: "
|
@@ -34,25 +32,31 @@ Benchmark.bm do |x|
|
|
34
32
|
article.elements.each do |e|
|
35
33
|
case e.first
|
36
34
|
when :mw_heading
|
37
|
-
|
35
|
+
format_wiki!(e.last)
|
36
|
+
line = e.last
|
38
37
|
when :mw_paragraph
|
39
|
-
|
38
|
+
format_wiki!(e.last)
|
39
|
+
line = e.last
|
40
40
|
when :mw_table, :mw_htable
|
41
|
-
|
41
|
+
format_wiki!(e.last)
|
42
|
+
line = e.last
|
42
43
|
when :mw_pre
|
43
44
|
line = e.last
|
44
45
|
when :mw_quote
|
45
|
-
|
46
|
+
format_wiki!(e.last)
|
47
|
+
line = e.last
|
46
48
|
when :mw_unordered, :mw_ordered, :mw_definition
|
47
|
-
|
49
|
+
format_wiki!(e.last)
|
50
|
+
line = e.last
|
48
51
|
when :mw_redirect
|
49
|
-
|
52
|
+
format_wiki!(e.last)
|
53
|
+
line = e.last
|
50
54
|
line += "\n\n"
|
51
55
|
else
|
52
56
|
next
|
53
57
|
end
|
54
58
|
contents += line
|
55
|
-
|
59
|
+
remove_templates!(contents)
|
56
60
|
end
|
57
61
|
|
58
62
|
##### cleanup #####
|
data/bin/wp2txt
CHANGED
@@ -31,39 +31,41 @@ EOS
|
|
31
31
|
opt :heading, "Show section titles in output", :default => true, :short => "-d"
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
34
|
-
opt :template, "
|
34
|
+
opt :template, "leave inline template notations unmodified", :default => false
|
35
35
|
opt :redirect, "Show redirect destination", :default => false
|
36
36
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
37
37
|
opt :category, "Show article category information", :default => false
|
38
|
-
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
38
|
+
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
|
+
opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
|
39
40
|
end
|
40
41
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
41
42
|
Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
43
|
+
Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
|
42
44
|
|
43
45
|
input_file = ARGV[0]
|
44
46
|
output_dir = opts[:output_dir]
|
45
47
|
tfile_size = opts[:file_size]
|
48
|
+
limit_recur = opts[:limit_recur]
|
46
49
|
convert = opts[:convert]
|
47
50
|
strip_tmarker = opts[:marker] ? false : true
|
48
|
-
opt_array = [:title, :list, :heading, :table, :
|
51
|
+
opt_array = [:title, :list, :heading, :table, :redirect]
|
52
|
+
$leave_template = true if opts[:template]
|
49
53
|
config = {}
|
50
54
|
opt_array.each do |opt|
|
51
55
|
config[opt] = opts[opt]
|
52
56
|
end
|
53
57
|
|
54
|
-
# a "parent" is either commandline progress bar or
|
55
|
-
# a gui window (not available for now)
|
56
58
|
parent = Wp2txt::CmdProgbar.new
|
57
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
59
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker, limit_recur)
|
58
60
|
|
59
61
|
wpconv.extract_text do |article|
|
60
|
-
|
61
|
-
title = "[[#{title}]]\n"
|
62
|
+
format_wiki!(article.title)
|
63
|
+
title = "[[#{article.title}]]\n"
|
62
64
|
|
63
65
|
if opts[:category] && !article.categories.empty?
|
64
66
|
contents = "\nCATEGORIES: "
|
65
|
-
contents
|
66
|
-
contents
|
67
|
+
contents << article.categories.join(", ")
|
68
|
+
contents << "\n\n"
|
67
69
|
else
|
68
70
|
contents = ""
|
69
71
|
end
|
@@ -72,44 +74,62 @@ wpconv.extract_text do |article|
|
|
72
74
|
case e.first
|
73
75
|
when :mw_heading
|
74
76
|
next if !config[:heading]
|
75
|
-
|
76
|
-
line
|
77
|
+
format_wiki!(e.last)
|
78
|
+
line = e.last
|
79
|
+
line << "+HEADING+" if $DEBUG_MODE
|
77
80
|
when :mw_paragraph
|
78
81
|
# next if !config[:paragraph]
|
79
|
-
|
80
|
-
line
|
82
|
+
format_wiki!(e.last)
|
83
|
+
line = e.last
|
84
|
+
line << "+PARAGRAPH+" if $DEBUG_MODE
|
81
85
|
when :mw_table, :mw_htable
|
82
86
|
next if !config[:table]
|
83
|
-
|
84
|
-
line
|
87
|
+
format_wiki!(e.last)
|
88
|
+
line = e.last
|
89
|
+
line << "+TABLE+" if $DEBUG_MODE
|
85
90
|
when :mw_pre
|
86
91
|
next if !config[:pre]
|
87
92
|
line = e.last
|
88
|
-
line
|
93
|
+
line << "+PRE+" if $DEBUG_MODE
|
89
94
|
when :mw_quote
|
90
95
|
# next if !config[:quote]
|
91
|
-
|
92
|
-
line
|
96
|
+
format_wiki!(e.last)
|
97
|
+
line = e.last
|
98
|
+
line << "+QUOTE+" if $DEBUG_MODE
|
93
99
|
when :mw_unordered, :mw_ordered, :mw_definition
|
94
100
|
next if !config[:list]
|
95
|
-
|
96
|
-
line
|
101
|
+
format_wiki!(e.last)
|
102
|
+
line = e.last
|
103
|
+
line << "+LIST+" if $DEBUG_MODE
|
97
104
|
when :mw_redirect
|
98
105
|
next if !config[:redirect]
|
99
|
-
|
100
|
-
line
|
101
|
-
line
|
106
|
+
format_wiki!(e.last)
|
107
|
+
line = e.last
|
108
|
+
line << "+REDIRECT+" if $DEBUG_MODE
|
109
|
+
line << "\n\n"
|
102
110
|
else
|
103
111
|
if $DEBUG_MODE
|
104
|
-
|
105
|
-
line
|
112
|
+
format_wiki!(e.last)
|
113
|
+
line = e.last
|
114
|
+
line << "+OTHER+"
|
106
115
|
else
|
107
116
|
next
|
108
117
|
end
|
109
118
|
end
|
110
|
-
contents
|
111
|
-
contents = remove_templates(contents) unless config[:template]
|
119
|
+
contents << line
|
112
120
|
end
|
121
|
+
|
122
|
+
remove_directive!(contents)
|
123
|
+
remove_emphasis!(contents)
|
124
|
+
mndash!(contents)
|
125
|
+
make_reference!(contents)
|
126
|
+
format_ref!(contents)
|
127
|
+
remove_hr!(contents)
|
128
|
+
remove_tag!(contents)
|
129
|
+
special_chr!(contents)
|
130
|
+
|
131
|
+
correct_inline_template!(contents) unless $leave_template
|
132
|
+
remove_templates!(contents) unless $leave_template
|
113
133
|
|
114
134
|
##### cleanup #####
|
115
135
|
if /\A\s*\z/m =~ contents
|
data/lib/wp2txt.rb
CHANGED
@@ -3,14 +3,18 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
-
require "rubygems"
|
7
|
-
require "bundler/setup"
|
8
|
-
require "nokogiri"
|
6
|
+
# require "rubygems"
|
7
|
+
# require "bundler/setup"
|
9
8
|
|
9
|
+
require "Nokogiri"
|
10
|
+
# require "oga"
|
11
|
+
# require "ox"
|
12
|
+
|
13
|
+
require 'pp'
|
10
14
|
require "wp2txt/article"
|
11
15
|
require "wp2txt/utils"
|
12
|
-
require "wp2txt/mw_api"
|
13
16
|
require "wp2txt/progressbar"
|
17
|
+
# require "wp2txt/mw_api"
|
14
18
|
|
15
19
|
begin
|
16
20
|
require "bzip2-ruby"
|
@@ -25,9 +29,7 @@ module Wp2txt
|
|
25
29
|
|
26
30
|
include Wp2txt
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
32
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10)
|
31
33
|
@parent = parent
|
32
34
|
@fp = nil
|
33
35
|
|
@@ -36,6 +38,9 @@ module Wp2txt
|
|
36
38
|
@tfile_size = tfile_size
|
37
39
|
@convert = convert
|
38
40
|
@strip_tmarker = strip_tmarker
|
41
|
+
|
42
|
+
#max number of recursive calls (global variable)
|
43
|
+
$limit_recur = limit_recur
|
39
44
|
end
|
40
45
|
|
41
46
|
def file_size(file)
|
@@ -111,7 +116,9 @@ module Wp2txt
|
|
111
116
|
else
|
112
117
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
113
118
|
end
|
119
|
+
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
114
120
|
@infile_size = file_size(file)
|
121
|
+
@parent.msg("... Done.", 1)
|
115
122
|
file.close # try to reopen since rewind method is unavailable
|
116
123
|
if RUBY_PLATFORM.index("win32")
|
117
124
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
@@ -237,13 +244,41 @@ module Wp2txt
|
|
237
244
|
while page = get_page
|
238
245
|
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
239
246
|
xml = xmlns + page + "</mediawiki>"
|
247
|
+
|
240
248
|
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
241
|
-
page = input.xpath("//xmlns:text").first
|
249
|
+
page = input.xpath("//xmlns:text").first
|
242
250
|
pp_title = page.parent.parent.at_css "title"
|
243
251
|
title = pp_title.content
|
244
|
-
|
245
|
-
next if /\:/ =~ title
|
252
|
+
next if /\:/ =~ title
|
246
253
|
text = page.content
|
254
|
+
|
255
|
+
# input = Oga.parse_xml(xml)
|
256
|
+
# page = input.xpath("//xmlns:text").first
|
257
|
+
# title = page.parent.parent.xpath("//xmlns:title").first.text
|
258
|
+
# next if /\:/ =~ title
|
259
|
+
# text = page.text
|
260
|
+
|
261
|
+
# input = Ox.load(xml, :encoding => "UTF-8")
|
262
|
+
# title = ""
|
263
|
+
# text = ""
|
264
|
+
# input.nodes.first.nodes.each do |n|
|
265
|
+
# if n.name == "title"
|
266
|
+
# title = n.nodes.first
|
267
|
+
# if /\:/ =~ title
|
268
|
+
# title = ""
|
269
|
+
# break
|
270
|
+
# end
|
271
|
+
# elsif n.name == "revision"
|
272
|
+
# n.nodes.each do |o|
|
273
|
+
# if o.name == "text"
|
274
|
+
# text = o.nodes.first
|
275
|
+
# break
|
276
|
+
# end
|
277
|
+
# end
|
278
|
+
# end
|
279
|
+
# end
|
280
|
+
# next if title == "" || text == ""
|
281
|
+
|
247
282
|
# remove all comment texts
|
248
283
|
# and insert as many newline characters as were contained in
|
249
284
|
# each comment instead
|
@@ -256,7 +291,7 @@ module Wp2txt
|
|
256
291
|
end
|
257
292
|
end
|
258
293
|
|
259
|
-
@count ||= 0;@count += 1;
|
294
|
+
@count ||= 0;@count += 1;
|
260
295
|
|
261
296
|
article = Article.new(text, title, @strip_tmarker)
|
262
297
|
output_text += block.call(article)
|
data/lib/wp2txt/article.rb
CHANGED
@@ -3,77 +3,37 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
+
|
6
7
|
require 'strscan'
|
7
8
|
require 'utils'
|
8
9
|
|
9
10
|
module Wp2txt
|
10
11
|
|
11
12
|
# possible element types, each of which can later be chosen to be printed or not
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
13
|
+
# :mw_heading
|
14
|
+
# :mw_htable
|
15
|
+
# :mw_quote
|
16
|
+
# :mw_unordered
|
17
|
+
# :mw_ordered
|
18
|
+
# :mw_definition
|
19
|
+
# :mw_pre
|
20
|
+
# :mw_paragraph
|
21
|
+
# :mw_comment
|
22
|
+
# :mw_math
|
23
|
+
# :mw_source
|
24
|
+
# :mw_inputbox
|
25
|
+
# :mw_template
|
26
|
+
# :mw_link
|
27
|
+
# :mw_summary
|
28
|
+
# :mw_blank
|
29
|
+
# :mw_redirect
|
30
|
+
|
30
31
|
# an article contains elements, each of which is [TYPE, string]
|
31
32
|
class Article
|
32
33
|
|
33
34
|
include Wp2txt
|
34
35
|
attr_accessor :elements, :title, :categories
|
35
36
|
|
36
|
-
# class variables to save resources when generating regexps
|
37
|
-
# those with a trailing number 1 represent opening tag/markup
|
38
|
-
# those with a trailing number 2 represent closing tag/markup
|
39
|
-
# those without a trailing number contain both opening/closing tags/markups
|
40
|
-
|
41
|
-
@@in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
|
42
|
-
@@in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
43
|
-
|
44
|
-
@@in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
45
|
-
@@in_inputbox_regex1 = Regexp.new('<inputbox>')
|
46
|
-
@@in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
47
|
-
|
48
|
-
@@in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
49
|
-
@@in_source_regex1 = Regexp.new('<source.*?>')
|
50
|
-
@@in_source_regex2 = Regexp.new('<\/source>')
|
51
|
-
|
52
|
-
@@in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
53
|
-
@@in_math_regex1 = Regexp.new('<math.*?>')
|
54
|
-
@@in_math_regex2 = Regexp.new('<\/math>')
|
55
|
-
|
56
|
-
@@in_heading_regex = Regexp.new('^=+.*?=+$')
|
57
|
-
|
58
|
-
@@in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
59
|
-
@@in_html_table_regex1 = Regexp.new('<table\b')
|
60
|
-
@@in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
61
|
-
|
62
|
-
@@in_table_regex1 = Regexp.new('^\s*\{\|')
|
63
|
-
@@in_table_regex2 = Regexp.new('^\|\}.*?$')
|
64
|
-
|
65
|
-
@@in_unordered_regex = Regexp.new('^\*')
|
66
|
-
@@in_ordered_regex = Regexp.new('^\#')
|
67
|
-
@@in_pre_regex = Regexp.new('^ ')
|
68
|
-
@@in_definition_regex = Regexp.new('^[\;\:]')
|
69
|
-
|
70
|
-
@@blank_line_regex = Regexp.new('^\s*$')
|
71
|
-
|
72
|
-
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
-
|
74
|
-
category_patterns = ["Category", "Categoria"].join("|")
|
75
|
-
@@category_regex = Regexp.new('[\{\[\|\b](?:' + category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
76
|
-
|
77
37
|
def initialize(text, title = "", strip_tmarker = false)
|
78
38
|
@title = title.strip
|
79
39
|
@strip_tmarker = strip_tmarker
|
@@ -91,39 +51,39 @@ module Wp2txt
|
|
91
51
|
open_stack = []
|
92
52
|
close_stack = []
|
93
53
|
source.each_line do |line|
|
94
|
-
matched = line.scan(
|
54
|
+
matched = line.scan($category_regex)
|
95
55
|
if matched && !matched.empty?
|
96
56
|
@categories += matched
|
97
|
-
@categories
|
57
|
+
@categories.uniq!
|
98
58
|
end
|
99
59
|
|
100
60
|
case mode
|
101
61
|
when :mw_table
|
102
|
-
if
|
62
|
+
if $in_table_regex2 =~ line
|
103
63
|
mode = nil
|
104
64
|
end
|
105
65
|
@elements.last.last << line
|
106
66
|
next
|
107
67
|
when :mw_inputbox
|
108
|
-
if
|
68
|
+
if $in_inputbox_regex2 =~ line
|
109
69
|
mode = nil
|
110
70
|
end
|
111
71
|
@elements.last.last << line
|
112
72
|
next
|
113
73
|
when :mw_source
|
114
|
-
if
|
74
|
+
if $in_source_regex2 =~ line
|
115
75
|
mode = nil
|
116
76
|
end
|
117
77
|
@elements.last.last << line
|
118
78
|
next
|
119
79
|
when :mw_math
|
120
|
-
if
|
80
|
+
if $in_math_regex2 =~ line
|
121
81
|
mode = nil
|
122
82
|
end
|
123
83
|
@elements.last.last << line
|
124
84
|
next
|
125
85
|
when :mw_htable
|
126
|
-
if
|
86
|
+
if $in_html_table_regex2 =~ line
|
127
87
|
mode = nil
|
128
88
|
end
|
129
89
|
@elements.last.last << line
|
@@ -131,51 +91,51 @@ module Wp2txt
|
|
131
91
|
end
|
132
92
|
|
133
93
|
case line
|
134
|
-
when
|
94
|
+
when $blank_line_regex
|
135
95
|
@elements << create_element(:mw_blank, "\n")
|
136
|
-
when
|
96
|
+
when $redirect_regex
|
137
97
|
@elements << create_element(:mw_redirect, line)
|
138
|
-
when
|
98
|
+
when $in_template_regex
|
139
99
|
@elements << create_element(:mw_template, line)
|
140
|
-
when
|
141
|
-
line = line.sub(
|
100
|
+
when $in_heading_regex
|
101
|
+
line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
|
142
102
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
143
|
-
when
|
103
|
+
when $in_inputbox_regex
|
144
104
|
@elements << create_element(:mw_inputbox, line)
|
145
|
-
when
|
105
|
+
when $in_inputbox_regex1
|
146
106
|
mode = :mw_inputbox
|
147
107
|
@elements << create_element(:mw_inputbox, line)
|
148
|
-
when
|
108
|
+
when $in_source_regex
|
149
109
|
@elements << create_element(:mw_source, line)
|
150
|
-
when
|
110
|
+
when $in_source_regex1
|
151
111
|
mode = :mw_source
|
152
112
|
@elements << create_element(:mw_source, line)
|
153
|
-
when
|
113
|
+
when $in_math_regex
|
154
114
|
@elements << create_element(:mw_math, line)
|
155
|
-
when
|
115
|
+
when $in_math_regex1
|
156
116
|
mode = :mw_math
|
157
117
|
@elements << create_element(:mw_math, line)
|
158
|
-
when
|
118
|
+
when $in_html_table_regex
|
159
119
|
@elements << create_element(:mw_htable, line)
|
160
|
-
when
|
120
|
+
when $in_html_table_regex1
|
161
121
|
mode = :mw_htable
|
162
122
|
@elements << create_element(:mw_htable, line)
|
163
|
-
when
|
123
|
+
when $in_table_regex1
|
164
124
|
mode = :mw_table
|
165
125
|
@elements << create_element(:mw_table, line)
|
166
|
-
when
|
167
|
-
line = line.sub(
|
126
|
+
when $in_unordered_regex
|
127
|
+
line = line.sub($list_marks_regex, "") if @strip_tmarker
|
168
128
|
@elements << create_element(:mw_unordered, line)
|
169
|
-
when
|
170
|
-
line = line.sub(
|
129
|
+
when $in_ordered_regex
|
130
|
+
line = line.sub($list_marks_regex, "") if @strip_tmarker
|
171
131
|
@elements << create_element(:mw_ordered, line)
|
172
|
-
when
|
173
|
-
line = line.sub(
|
132
|
+
when $in_pre_regex
|
133
|
+
line = line.sub($pre_marks_regex, "") if @strip_tmarker
|
174
134
|
@elements << create_element(:mw_pre, line)
|
175
|
-
when
|
176
|
-
line = line.sub(
|
135
|
+
when $in_definition_regex
|
136
|
+
line = line.sub($def_marks_regex, "") if @strip_tmarker
|
177
137
|
@elements << create_element(:mw_definition, line)
|
178
|
-
when
|
138
|
+
when $in_link_regex
|
179
139
|
@elements << create_element(:mw_link, line)
|
180
140
|
else
|
181
141
|
@elements << create_element(:mw_paragraph, line)
|