wp2txt 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -11
- data/bin/benchmark.rb +14 -10
- data/bin/wp2txt +48 -28
- data/lib/wp2txt.rb +46 -11
- data/lib/wp2txt/article.rb +49 -89
- data/lib/wp2txt/mw_api.rb +0 -0
- data/lib/wp2txt/utils.rb +174 -112
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +60 -41
- data/wp2txt.gemspec +3 -9
- metadata +3 -59
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80f68e6c1ac855160575f85f4d78ca378f0a1c2b
|
4
|
+
data.tar.gz: 16bbac80e7139ea63dd46baf54fb5deaf0840e59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 004d26fa39aae4eb194858cf85ae8aad33f65dc556a08bbfc499ead05d49e70af4f5ba5e708354aa816cd6b38d8e9860866cefa7d6c0730058e9a186ff9eec31
|
7
|
+
data.tar.gz: c2523b8afeab165c37de028eedff36e719a2472f9440469e4041c342b08463d439351a89523d959ff28d53364c76a2af44502113bb2084eacbbc8ac14306f8a4
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
|
|
8
8
|
|
9
9
|
### Features ###
|
10
10
|
|
11
|
-
* Convert dump files of Wikipedia of
|
11
|
+
* Convert dump files of Wikipedia of various languages (I hope).
|
12
12
|
* Create output files of specified size.
|
13
13
|
* Allow users to specify text elements to be extracted/converted (page titles, section titles, lists, and tables).
|
14
14
|
|
@@ -16,12 +16,6 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
|
|
16
16
|
|
17
17
|
$ gem install wp2txt
|
18
18
|
|
19
|
-
It is highly recommended you also install bz2-ruby gem. See the following for the details about bz2-ruby gem:
|
20
|
-
|
21
|
-
[https://github.com/brianmario/bzip2-ruby](https://github.com/brianmario/bzip2-ruby)
|
22
|
-
|
23
|
-
When the above gem is not found, wp2txt will try to use bzip2 program in your command line environment. Supposedly he former option is more reliable as well as fast.
|
24
|
-
|
25
19
|
### Usage
|
26
20
|
|
27
21
|
Obtain a Wikipedia dump file (from [here](http://dumps.wikimedia.org/backup-index.html)) with a file name such as:
|
@@ -32,10 +26,10 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
32
26
|
|
33
27
|
Command line options are as follows:
|
34
28
|
|
35
|
-
*CAUTION:*
|
29
|
+
*CAUTION:* Command line options in the current version have been drastically changed from previous versions.
|
36
30
|
|
37
|
-
|
38
|
-
|
31
|
+
Usage: wp2txt [options]
|
32
|
+
where [options] are:
|
39
33
|
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
40
34
|
.txt (uncompressed) format
|
41
35
|
--output-dir, -o <s>: Output directory (default:
|
@@ -46,13 +40,15 @@ Command line options are as follows:
|
|
46
40
|
--heading, --no-heading, -d: Show section titles in output (default: true)
|
47
41
|
--title, --no-title, -t: Show page titles in output (default: true)
|
48
42
|
--table, -a: Show table source code in output
|
49
|
-
--template, -e:
|
43
|
+
--template, -e: leave inline template notations unmodified
|
50
44
|
--redirect, -r: Show redirect destination
|
51
45
|
--marker, --no-marker, -m: Show symbols prefixed to list items,
|
52
46
|
definitions, etc. (Default: true)
|
53
47
|
--category, -g: Show article category information
|
54
48
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
55
49
|
(default: 10)
|
50
|
+
--limit-recur, -u <i>: Max number of recursive call (0 to 10)
|
51
|
+
(default: 10)
|
56
52
|
--version, -v: Print version and exit
|
57
53
|
--help, -h: Show this message
|
58
54
|
|
@@ -71,6 +67,11 @@ Command line options are as follows:
|
|
71
67
|
|
72
68
|
* Yoichiro Hasebe (<yohasebe@gmail.com>)
|
73
69
|
|
70
|
+
### References ###
|
71
|
+
|
72
|
+
* Yoichiro HASEBE. 2006. [Method for using Wikipedia as Japanese corpus.](http://ci.nii.ac.jp/naid/110006226727) _Doshisha Studies in Language and Culture_ 9(2), 373-403.
|
73
|
+
* 長谷部陽一郎. 2006. [Wikipedia日本語版をコーパスとして用いた言語研究の手法](http://ci.nii.ac.jp/naid/110006226727). 『言語文化』9(2), 373-403.
|
74
|
+
|
74
75
|
### License ###
|
75
76
|
|
76
77
|
This software is distributed under the MIT License. Please see the LICENSE file.
|
data/bin/benchmark.rb
CHANGED
@@ -18,13 +18,11 @@ tfile_size = 10
|
|
18
18
|
convert = true
|
19
19
|
strip_tmarker = true
|
20
20
|
|
21
|
-
|
22
|
-
|
23
21
|
Benchmark.bm do |x|
|
24
22
|
x.report do
|
25
23
|
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
26
24
|
wpconv.extract_text do |article|
|
27
|
-
title = format_wiki article.title
|
25
|
+
title = format_wiki! article.title
|
28
26
|
title = "[[#{title}]]\n"
|
29
27
|
|
30
28
|
contents = "\nCATEGORIES: "
|
@@ -34,25 +32,31 @@ Benchmark.bm do |x|
|
|
34
32
|
article.elements.each do |e|
|
35
33
|
case e.first
|
36
34
|
when :mw_heading
|
37
|
-
|
35
|
+
format_wiki!(e.last)
|
36
|
+
line = e.last
|
38
37
|
when :mw_paragraph
|
39
|
-
|
38
|
+
format_wiki!(e.last)
|
39
|
+
line = e.last
|
40
40
|
when :mw_table, :mw_htable
|
41
|
-
|
41
|
+
format_wiki!(e.last)
|
42
|
+
line = e.last
|
42
43
|
when :mw_pre
|
43
44
|
line = e.last
|
44
45
|
when :mw_quote
|
45
|
-
|
46
|
+
format_wiki!(e.last)
|
47
|
+
line = e.last
|
46
48
|
when :mw_unordered, :mw_ordered, :mw_definition
|
47
|
-
|
49
|
+
format_wiki!(e.last)
|
50
|
+
line = e.last
|
48
51
|
when :mw_redirect
|
49
|
-
|
52
|
+
format_wiki!(e.last)
|
53
|
+
line = e.last
|
50
54
|
line += "\n\n"
|
51
55
|
else
|
52
56
|
next
|
53
57
|
end
|
54
58
|
contents += line
|
55
|
-
|
59
|
+
remove_templates!(contents)
|
56
60
|
end
|
57
61
|
|
58
62
|
##### cleanup #####
|
data/bin/wp2txt
CHANGED
@@ -31,39 +31,41 @@ EOS
|
|
31
31
|
opt :heading, "Show section titles in output", :default => true, :short => "-d"
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
34
|
-
opt :template, "
|
34
|
+
opt :template, "leave inline template notations unmodified", :default => false
|
35
35
|
opt :redirect, "Show redirect destination", :default => false
|
36
36
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
37
37
|
opt :category, "Show article category information", :default => false
|
38
|
-
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
38
|
+
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
|
+
opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
|
39
40
|
end
|
40
41
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
41
42
|
Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
43
|
+
Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
|
42
44
|
|
43
45
|
input_file = ARGV[0]
|
44
46
|
output_dir = opts[:output_dir]
|
45
47
|
tfile_size = opts[:file_size]
|
48
|
+
limit_recur = opts[:limit_recur]
|
46
49
|
convert = opts[:convert]
|
47
50
|
strip_tmarker = opts[:marker] ? false : true
|
48
|
-
opt_array = [:title, :list, :heading, :table, :
|
51
|
+
opt_array = [:title, :list, :heading, :table, :redirect]
|
52
|
+
$leave_template = true if opts[:template]
|
49
53
|
config = {}
|
50
54
|
opt_array.each do |opt|
|
51
55
|
config[opt] = opts[opt]
|
52
56
|
end
|
53
57
|
|
54
|
-
# a "parent" is either commandline progress bar or
|
55
|
-
# a gui window (not available for now)
|
56
58
|
parent = Wp2txt::CmdProgbar.new
|
57
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
59
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker, limit_recur)
|
58
60
|
|
59
61
|
wpconv.extract_text do |article|
|
60
|
-
|
61
|
-
title = "[[#{title}]]\n"
|
62
|
+
format_wiki!(article.title)
|
63
|
+
title = "[[#{article.title}]]\n"
|
62
64
|
|
63
65
|
if opts[:category] && !article.categories.empty?
|
64
66
|
contents = "\nCATEGORIES: "
|
65
|
-
contents
|
66
|
-
contents
|
67
|
+
contents << article.categories.join(", ")
|
68
|
+
contents << "\n\n"
|
67
69
|
else
|
68
70
|
contents = ""
|
69
71
|
end
|
@@ -72,44 +74,62 @@ wpconv.extract_text do |article|
|
|
72
74
|
case e.first
|
73
75
|
when :mw_heading
|
74
76
|
next if !config[:heading]
|
75
|
-
|
76
|
-
line
|
77
|
+
format_wiki!(e.last)
|
78
|
+
line = e.last
|
79
|
+
line << "+HEADING+" if $DEBUG_MODE
|
77
80
|
when :mw_paragraph
|
78
81
|
# next if !config[:paragraph]
|
79
|
-
|
80
|
-
line
|
82
|
+
format_wiki!(e.last)
|
83
|
+
line = e.last
|
84
|
+
line << "+PARAGRAPH+" if $DEBUG_MODE
|
81
85
|
when :mw_table, :mw_htable
|
82
86
|
next if !config[:table]
|
83
|
-
|
84
|
-
line
|
87
|
+
format_wiki!(e.last)
|
88
|
+
line = e.last
|
89
|
+
line << "+TABLE+" if $DEBUG_MODE
|
85
90
|
when :mw_pre
|
86
91
|
next if !config[:pre]
|
87
92
|
line = e.last
|
88
|
-
line
|
93
|
+
line << "+PRE+" if $DEBUG_MODE
|
89
94
|
when :mw_quote
|
90
95
|
# next if !config[:quote]
|
91
|
-
|
92
|
-
line
|
96
|
+
format_wiki!(e.last)
|
97
|
+
line = e.last
|
98
|
+
line << "+QUOTE+" if $DEBUG_MODE
|
93
99
|
when :mw_unordered, :mw_ordered, :mw_definition
|
94
100
|
next if !config[:list]
|
95
|
-
|
96
|
-
line
|
101
|
+
format_wiki!(e.last)
|
102
|
+
line = e.last
|
103
|
+
line << "+LIST+" if $DEBUG_MODE
|
97
104
|
when :mw_redirect
|
98
105
|
next if !config[:redirect]
|
99
|
-
|
100
|
-
line
|
101
|
-
line
|
106
|
+
format_wiki!(e.last)
|
107
|
+
line = e.last
|
108
|
+
line << "+REDIRECT+" if $DEBUG_MODE
|
109
|
+
line << "\n\n"
|
102
110
|
else
|
103
111
|
if $DEBUG_MODE
|
104
|
-
|
105
|
-
line
|
112
|
+
format_wiki!(e.last)
|
113
|
+
line = e.last
|
114
|
+
line << "+OTHER+"
|
106
115
|
else
|
107
116
|
next
|
108
117
|
end
|
109
118
|
end
|
110
|
-
contents
|
111
|
-
contents = remove_templates(contents) unless config[:template]
|
119
|
+
contents << line
|
112
120
|
end
|
121
|
+
|
122
|
+
remove_directive!(contents)
|
123
|
+
remove_emphasis!(contents)
|
124
|
+
mndash!(contents)
|
125
|
+
make_reference!(contents)
|
126
|
+
format_ref!(contents)
|
127
|
+
remove_hr!(contents)
|
128
|
+
remove_tag!(contents)
|
129
|
+
special_chr!(contents)
|
130
|
+
|
131
|
+
correct_inline_template!(contents) unless $leave_template
|
132
|
+
remove_templates!(contents) unless $leave_template
|
113
133
|
|
114
134
|
##### cleanup #####
|
115
135
|
if /\A\s*\z/m =~ contents
|
data/lib/wp2txt.rb
CHANGED
@@ -3,14 +3,18 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
-
require "rubygems"
|
7
|
-
require "bundler/setup"
|
8
|
-
require "nokogiri"
|
6
|
+
# require "rubygems"
|
7
|
+
# require "bundler/setup"
|
9
8
|
|
9
|
+
require "Nokogiri"
|
10
|
+
# require "oga"
|
11
|
+
# require "ox"
|
12
|
+
|
13
|
+
require 'pp'
|
10
14
|
require "wp2txt/article"
|
11
15
|
require "wp2txt/utils"
|
12
|
-
require "wp2txt/mw_api"
|
13
16
|
require "wp2txt/progressbar"
|
17
|
+
# require "wp2txt/mw_api"
|
14
18
|
|
15
19
|
begin
|
16
20
|
require "bzip2-ruby"
|
@@ -25,9 +29,7 @@ module Wp2txt
|
|
25
29
|
|
26
30
|
include Wp2txt
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
32
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10)
|
31
33
|
@parent = parent
|
32
34
|
@fp = nil
|
33
35
|
|
@@ -36,6 +38,9 @@ module Wp2txt
|
|
36
38
|
@tfile_size = tfile_size
|
37
39
|
@convert = convert
|
38
40
|
@strip_tmarker = strip_tmarker
|
41
|
+
|
42
|
+
#max number of recursive calls (global variable)
|
43
|
+
$limit_recur = limit_recur
|
39
44
|
end
|
40
45
|
|
41
46
|
def file_size(file)
|
@@ -111,7 +116,9 @@ module Wp2txt
|
|
111
116
|
else
|
112
117
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
113
118
|
end
|
119
|
+
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
114
120
|
@infile_size = file_size(file)
|
121
|
+
@parent.msg("... Done.", 1)
|
115
122
|
file.close # try to reopen since rewind method is unavailable
|
116
123
|
if RUBY_PLATFORM.index("win32")
|
117
124
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
@@ -237,13 +244,41 @@ module Wp2txt
|
|
237
244
|
while page = get_page
|
238
245
|
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
239
246
|
xml = xmlns + page + "</mediawiki>"
|
247
|
+
|
240
248
|
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
241
|
-
page = input.xpath("//xmlns:text").first
|
249
|
+
page = input.xpath("//xmlns:text").first
|
242
250
|
pp_title = page.parent.parent.at_css "title"
|
243
251
|
title = pp_title.content
|
244
|
-
|
245
|
-
next if /\:/ =~ title
|
252
|
+
next if /\:/ =~ title
|
246
253
|
text = page.content
|
254
|
+
|
255
|
+
# input = Oga.parse_xml(xml)
|
256
|
+
# page = input.xpath("//xmlns:text").first
|
257
|
+
# title = page.parent.parent.xpath("//xmlns:title").first.text
|
258
|
+
# next if /\:/ =~ title
|
259
|
+
# text = page.text
|
260
|
+
|
261
|
+
# input = Ox.load(xml, :encoding => "UTF-8")
|
262
|
+
# title = ""
|
263
|
+
# text = ""
|
264
|
+
# input.nodes.first.nodes.each do |n|
|
265
|
+
# if n.name == "title"
|
266
|
+
# title = n.nodes.first
|
267
|
+
# if /\:/ =~ title
|
268
|
+
# title = ""
|
269
|
+
# break
|
270
|
+
# end
|
271
|
+
# elsif n.name == "revision"
|
272
|
+
# n.nodes.each do |o|
|
273
|
+
# if o.name == "text"
|
274
|
+
# text = o.nodes.first
|
275
|
+
# break
|
276
|
+
# end
|
277
|
+
# end
|
278
|
+
# end
|
279
|
+
# end
|
280
|
+
# next if title == "" || text == ""
|
281
|
+
|
247
282
|
# remove all comment texts
|
248
283
|
# and insert as many number of new line chars included in
|
249
284
|
# each comment instead
|
@@ -256,7 +291,7 @@ module Wp2txt
|
|
256
291
|
end
|
257
292
|
end
|
258
293
|
|
259
|
-
@count ||= 0;@count += 1;
|
294
|
+
@count ||= 0;@count += 1;
|
260
295
|
|
261
296
|
article = Article.new(text, title, @strip_tmarker)
|
262
297
|
output_text += block.call(article)
|
data/lib/wp2txt/article.rb
CHANGED
@@ -3,77 +3,37 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
+
|
6
7
|
require 'strscan'
|
7
8
|
require 'utils'
|
8
9
|
|
9
10
|
module Wp2txt
|
10
11
|
|
11
12
|
# possible element type, which could be later chosen to print or not to print
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
13
|
+
# :mw_heading
|
14
|
+
# :mw_htable
|
15
|
+
# :mw_quote
|
16
|
+
# :mw_unordered
|
17
|
+
# :mw_ordered
|
18
|
+
# :mw_definition
|
19
|
+
# :mw_pre
|
20
|
+
# :mw_paragraph
|
21
|
+
# :mw_comment
|
22
|
+
# :mw_math
|
23
|
+
# :mw_source
|
24
|
+
# :mw_inputbox
|
25
|
+
# :mw_template
|
26
|
+
# :mw_link
|
27
|
+
# :mw_summary
|
28
|
+
# :mw_blank
|
29
|
+
# :mw_redirect
|
30
|
+
|
30
31
|
# an article contains elements, each of which is [TYPE, string]
|
31
32
|
class Article
|
32
33
|
|
33
34
|
include Wp2txt
|
34
35
|
attr_accessor :elements, :title, :categories
|
35
36
|
|
36
|
-
# class varialbes to save resource for generating regexps
|
37
|
-
# those with a trailing number 1 represent opening tag/markup
|
38
|
-
# those with a trailing number 2 represent closing tag/markup
|
39
|
-
# those without a trailing number contain both opening/closing tags/markups
|
40
|
-
|
41
|
-
@@in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
|
42
|
-
@@in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
43
|
-
|
44
|
-
@@in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
45
|
-
@@in_inputbox_regex1 = Regexp.new('<inputbox>')
|
46
|
-
@@in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
47
|
-
|
48
|
-
@@in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
49
|
-
@@in_source_regex1 = Regexp.new('<source.*?>')
|
50
|
-
@@in_source_regex2 = Regexp.new('<\/source>')
|
51
|
-
|
52
|
-
@@in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
53
|
-
@@in_math_regex1 = Regexp.new('<math.*?>')
|
54
|
-
@@in_math_regex2 = Regexp.new('<\/math>')
|
55
|
-
|
56
|
-
@@in_heading_regex = Regexp.new('^=+.*?=+$')
|
57
|
-
|
58
|
-
@@in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
59
|
-
@@in_html_table_regex1 = Regexp.new('<table\b')
|
60
|
-
@@in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
61
|
-
|
62
|
-
@@in_table_regex1 = Regexp.new('^\s*\{\|')
|
63
|
-
@@in_table_regex2 = Regexp.new('^\|\}.*?$')
|
64
|
-
|
65
|
-
@@in_unordered_regex = Regexp.new('^\*')
|
66
|
-
@@in_ordered_regex = Regexp.new('^\#')
|
67
|
-
@@in_pre_regex = Regexp.new('^ ')
|
68
|
-
@@in_definition_regex = Regexp.new('^[\;\:]')
|
69
|
-
|
70
|
-
@@blank_line_regex = Regexp.new('^\s*$')
|
71
|
-
|
72
|
-
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
-
|
74
|
-
category_patterns = ["Category", "Categoria"].join("|")
|
75
|
-
@@category_regex = Regexp.new('[\{\[\|\b](?:' + category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
76
|
-
|
77
37
|
def initialize(text, title = "", strip_tmarker = false)
|
78
38
|
@title = title.strip
|
79
39
|
@strip_tmarker = strip_tmarker
|
@@ -91,39 +51,39 @@ module Wp2txt
|
|
91
51
|
open_stack = []
|
92
52
|
close_stack = []
|
93
53
|
source.each_line do |line|
|
94
|
-
matched = line.scan(
|
54
|
+
matched = line.scan($category_regex)
|
95
55
|
if matched && !matched.empty?
|
96
56
|
@categories += matched
|
97
|
-
@categories
|
57
|
+
@categories.uniq!
|
98
58
|
end
|
99
59
|
|
100
60
|
case mode
|
101
61
|
when :mw_table
|
102
|
-
if
|
62
|
+
if $in_table_regex2 =~ line
|
103
63
|
mode = nil
|
104
64
|
end
|
105
65
|
@elements.last.last << line
|
106
66
|
next
|
107
67
|
when :mw_inputbox
|
108
|
-
if
|
68
|
+
if $in_inputbox_regex2 =~ line
|
109
69
|
mode = nil
|
110
70
|
end
|
111
71
|
@elements.last.last << line
|
112
72
|
next
|
113
73
|
when :mw_source
|
114
|
-
if
|
74
|
+
if $in_source_regex2 =~ line
|
115
75
|
mode = nil
|
116
76
|
end
|
117
77
|
@elements.last.last << line
|
118
78
|
next
|
119
79
|
when :mw_math
|
120
|
-
if
|
80
|
+
if $in_math_regex2 =~ line
|
121
81
|
mode = nil
|
122
82
|
end
|
123
83
|
@elements.last.last << line
|
124
84
|
next
|
125
85
|
when :mw_htable
|
126
|
-
if
|
86
|
+
if $in_html_table_regex2 =~ line
|
127
87
|
mode = nil
|
128
88
|
end
|
129
89
|
@elements.last.last << line
|
@@ -131,51 +91,51 @@ module Wp2txt
|
|
131
91
|
end
|
132
92
|
|
133
93
|
case line
|
134
|
-
when
|
94
|
+
when $blank_line_regex
|
135
95
|
@elements << create_element(:mw_blank, "\n")
|
136
|
-
when
|
96
|
+
when $redirect_regex
|
137
97
|
@elements << create_element(:mw_redirect, line)
|
138
|
-
when
|
98
|
+
when $in_template_regex
|
139
99
|
@elements << create_element(:mw_template, line)
|
140
|
-
when
|
141
|
-
line = line.sub(
|
100
|
+
when $in_heading_regex
|
101
|
+
line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
|
142
102
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
143
|
-
when
|
103
|
+
when $in_inputbox_regex
|
144
104
|
@elements << create_element(:mw_inputbox, line)
|
145
|
-
when
|
105
|
+
when $in_inputbox_regex1
|
146
106
|
mode = :mw_inputbox
|
147
107
|
@elements << create_element(:mw_inputbox, line)
|
148
|
-
when
|
108
|
+
when $in_source_regex
|
149
109
|
@elements << create_element(:mw_source, line)
|
150
|
-
when
|
110
|
+
when $in_source_regex1
|
151
111
|
mode = :mw_source
|
152
112
|
@elements << create_element(:mw_source, line)
|
153
|
-
when
|
113
|
+
when $in_math_regex
|
154
114
|
@elements << create_element(:mw_math, line)
|
155
|
-
when
|
115
|
+
when $in_math_regex1
|
156
116
|
mode = :mw_math
|
157
117
|
@elements << create_element(:mw_math, line)
|
158
|
-
when
|
118
|
+
when $in_html_table_regex
|
159
119
|
@elements << create_element(:mw_htable, line)
|
160
|
-
when
|
120
|
+
when $in_html_table_regex1
|
161
121
|
mode = :mw_htable
|
162
122
|
@elements << create_element(:mw_htable, line)
|
163
|
-
when
|
123
|
+
when $in_table_regex1
|
164
124
|
mode = :mw_table
|
165
125
|
@elements << create_element(:mw_table, line)
|
166
|
-
when
|
167
|
-
line = line.sub(
|
126
|
+
when $in_unordered_regex
|
127
|
+
line = line.sub($list_marks_regex, "") if @strip_tmarker
|
168
128
|
@elements << create_element(:mw_unordered, line)
|
169
|
-
when
|
170
|
-
line = line.sub(
|
129
|
+
when $in_ordered_regex
|
130
|
+
line = line.sub($list_marks_regex, "") if @strip_tmarker
|
171
131
|
@elements << create_element(:mw_ordered, line)
|
172
|
-
when
|
173
|
-
line = line.sub(
|
132
|
+
when $in_pre_regex
|
133
|
+
line = line.sub($pre_marks_regex, "") if @strip_tmarker
|
174
134
|
@elements << create_element(:mw_pre, line)
|
175
|
-
when
|
176
|
-
line = line.sub(
|
135
|
+
when $in_definition_regex
|
136
|
+
line = line.sub($def_marks_regex, "") if @strip_tmarker
|
177
137
|
@elements << create_element(:mw_definition, line)
|
178
|
-
when
|
138
|
+
when $in_link_regex
|
179
139
|
@elements << create_element(:mw_link, line)
|
180
140
|
else
|
181
141
|
@elements << create_element(:mw_paragraph, line)
|