wp2txt 1.0.2 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +34 -6
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +129 -155
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/bin/wp2txt
CHANGED
@@ -1,197 +1,192 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
-
|
7
|
-
$DEBUG_MODE = false
|
8
|
-
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
|
9
|
-
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
10
|
-
|
11
|
-
require 'wp2txt'
|
12
|
-
require 'wp2txt/utils'
|
13
|
-
require 'wp2txt/version'
|
14
|
-
require 'etc'
|
15
|
-
require 'optimist'
|
16
|
-
require 'parallel'
|
17
|
-
require 'pastel'
|
18
|
-
require 'tty-spinner'
|
19
|
-
|
20
|
-
include Wp2txt
|
21
|
-
|
22
|
-
opts = Optimist::options do
|
23
|
-
version Wp2txt::VERSION
|
24
|
-
banner <<-EOS
|
25
|
-
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
26
|
-
|
27
|
-
Usage: wp2txt [options]
|
28
|
-
where [options] are:
|
29
|
-
EOS
|
30
|
-
|
31
|
-
opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", :required => true, :short => "-i"
|
32
|
-
opt :output_dir, "Path to output directory", :default => Dir::pwd, :type => String, :short => "-o"
|
33
|
-
opt :convert, "Output in plain text (converting from XML)", :default => true, :short => "-c"
|
34
|
-
opt :category, "Show article category information", :default => true, :short => "-a"
|
35
|
-
opt :category_only, "Extract only article title and categories", :default => false, :short => "-g"
|
36
|
-
opt :summary_only, "Extract only article title, categories, and summary text before first heading", :default => false, :short => "-s"
|
37
|
-
opt :file_size, "Approximate size (in MB) of each output file", :default => 10, :short => "-f"
|
38
|
-
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", :short => "-n"
|
39
|
-
opt :del_interfile, "Delete intermediate XML files from output dir", :short => "-x", :default => false
|
40
|
-
opt :title, "Keep page titles in output", :default => true, :short => "-t"
|
41
|
-
opt :heading, "Keep section titles in output", :default => true, :short => "-d"
|
42
|
-
opt :list, "Keep unprocessed list items in output", :default => false, :short => "-l"
|
43
|
-
opt :ref, "Keep reference notations in the format [ref]...[/ref]", :default => false, :short => "-r"
|
44
|
-
opt :redirect, "Show redirect destination", :default => false, :short => "-e"
|
45
|
-
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true, :short => "-m"
|
46
|
-
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", :default => false, :short => "-b"
|
47
|
-
end
|
48
|
-
|
49
|
-
Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
50
|
-
Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
51
|
-
|
52
|
-
pastel = Pastel.new
|
53
2
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
DEBUG_MODE = false
|
6
|
+
SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
|
7
|
+
DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
|
8
|
+
|
9
|
+
require_relative "../lib/wp2txt"
|
10
|
+
require_relative "../lib/wp2txt/utils"
|
11
|
+
require_relative "../lib/wp2txt/version"
|
12
|
+
|
13
|
+
require "etc"
|
14
|
+
require "optimist"
|
15
|
+
require "parallel"
|
16
|
+
require "pastel"
|
17
|
+
require "tty-spinner"
|
18
|
+
|
19
|
+
class WpApp
|
20
|
+
include Wp2txt
|
21
|
+
|
22
|
+
def run
|
23
|
+
opts = Optimist.options do
|
24
|
+
version VERSION
|
25
|
+
banner <<~BANNER
|
26
|
+
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
27
|
+
|
28
|
+
Usage: wp2txt [options]
|
29
|
+
where [options] are:
|
30
|
+
BANNER
|
31
|
+
|
32
|
+
opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", type: String, required: true, short: "-i"
|
33
|
+
opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
|
34
|
+
opt :convert, "Output in plain text (converting from XML)", default: true, short: "-c"
|
35
|
+
opt :category, "Show article category information", default: true, short: "-a"
|
36
|
+
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
37
|
+
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
38
|
+
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
39
|
+
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
|
40
|
+
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
41
|
+
opt :title, "Keep page titles in output", default: true, short: "-t"
|
42
|
+
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
43
|
+
opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
|
44
|
+
opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
|
45
|
+
opt :redirect, "Show redirect destination", default: false, short: "-e"
|
46
|
+
opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
|
47
|
+
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
|
48
|
+
end
|
86
49
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
50
|
+
Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
51
|
+
Optimist.die :input, "must exist" unless File.exist?(opts[:input])
|
52
|
+
Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
53
|
+
|
54
|
+
pastel = Pastel.new
|
55
|
+
|
56
|
+
input_file = opts[:input]
|
57
|
+
output_dir = opts[:output_dir]
|
58
|
+
tfile_size = opts[:file_size]
|
59
|
+
num_processors = Etc.nprocessors
|
60
|
+
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
|
61
|
+
opts[:num_procs]
|
62
|
+
else
|
63
|
+
num_processors - 2
|
64
|
+
end
|
65
|
+
num_processes = 1 if num_processes < 1
|
66
|
+
|
67
|
+
convert = opts[:convert]
|
68
|
+
strip_tmarker = opts[:marker] ? false : true
|
69
|
+
opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
|
70
|
+
|
71
|
+
config = {}
|
72
|
+
opt_array.each do |opt|
|
73
|
+
config[opt] = opts[opt]
|
74
|
+
end
|
107
75
|
|
108
|
-
|
109
|
-
|
110
|
-
puts "Number of files being processed: " + pastel.bold("#{input_files.size}")
|
111
|
-
puts "Number of CPU cores being used: " + pastel.bold("#{num_processes}")
|
112
|
-
|
113
|
-
Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |input_file|
|
114
|
-
wpconv = Wp2txt::Runner.new(input_file, output_dir, strip_tmarker, config[:del_interfile])
|
115
|
-
wpconv.extract_text do |article|
|
116
|
-
format_wiki!(article.title)
|
117
|
-
|
118
|
-
if config[:category_only]
|
119
|
-
title = "#{article.title}\t"
|
120
|
-
contents = article.categories.join(", ")
|
121
|
-
contents << "\n"
|
122
|
-
elsif config[:category] && !article.categories.empty?
|
123
|
-
title = "\n[[#{article.title}]]\n\n"
|
124
|
-
contents = "\nCATEGORIES: "
|
125
|
-
contents << article.categories.join(", ")
|
126
|
-
contents << "\n\n"
|
76
|
+
if File.ftype(input_file) == "directory"
|
77
|
+
input_files = Dir.glob("#{input_file}/*.xml")
|
127
78
|
else
|
128
|
-
|
129
|
-
|
79
|
+
puts ""
|
80
|
+
puts pastel.green.bold("Preprocessing")
|
81
|
+
puts "Decompressing and splitting the original dump file."
|
82
|
+
puts pastel.underline("This may take a while. Please be patient!")
|
83
|
+
|
84
|
+
time_start = Time.now.to_i
|
85
|
+
wpsplitter = Splitter.new(input_file, output_dir, tfile_size)
|
86
|
+
spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
|
87
|
+
spinner.auto_spin
|
88
|
+
wpsplitter.split_file
|
89
|
+
time_finish = Time.now.to_i
|
90
|
+
|
91
|
+
spinner.stop("Time: #{sec_to_str(time_finish - time_start)}") # Stop animation
|
92
|
+
puts pastel.blue.bold("Complete!")
|
93
|
+
exit unless convert
|
94
|
+
input_files = Dir.glob("#{output_dir}/*.xml")
|
130
95
|
end
|
131
96
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
line = e.last
|
152
|
-
line << "+PRE+" if $DEBUG_MODE
|
153
|
-
when :mw_quote
|
154
|
-
line = e.last
|
155
|
-
line << "+QUOTE+" if $DEBUG_MODE
|
156
|
-
when :mw_unordered, :mw_ordered, :mw_definition
|
157
|
-
next if !config[:list]
|
158
|
-
line = e.last
|
159
|
-
line << "+LIST+" if $DEBUG_MODE
|
160
|
-
when :mw_ml_template
|
161
|
-
next if !config[:multiline]
|
162
|
-
line = e.last
|
163
|
-
line << "+MLTEMPLATE+" if $DEBUG_MODE
|
164
|
-
when :mw_redirect
|
165
|
-
next if !config[:redirect]
|
166
|
-
line = e.last
|
167
|
-
line << "+REDIRECT+" if $DEBUG_MODE
|
168
|
-
line << "\n\n"
|
169
|
-
when :mw_isolated_template
|
170
|
-
next if !config[:multiline]
|
171
|
-
line = e.last
|
172
|
-
line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
|
173
|
-
when :mw_isolated_tag
|
174
|
-
next
|
97
|
+
puts ""
|
98
|
+
puts pastel.red.bold("Converting")
|
99
|
+
puts "Number of files being processed: " + pastel.bold(input_files.size.to_s)
|
100
|
+
puts "Number of CPU cores being used: " + pastel.bold(num_processes.to_s)
|
101
|
+
|
102
|
+
Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |infile|
|
103
|
+
wpconv = Runner.new(infile, output_dir, strip_tmarker, config[:del_interfile])
|
104
|
+
wpconv.extract_text do |article|
|
105
|
+
article.title = format_wiki(article.title, config)
|
106
|
+
|
107
|
+
if config[:category_only]
|
108
|
+
title = "#{article.title}\t"
|
109
|
+
contents = article.categories.join(", ")
|
110
|
+
contents << "\n"
|
111
|
+
elsif config[:category] && !article.categories.empty?
|
112
|
+
title = "\n[[#{article.title}]]\n\n"
|
113
|
+
contents = +"\nCATEGORIES: "
|
114
|
+
contents << article.categories.join(", ")
|
115
|
+
contents << "\n\n"
|
175
116
|
else
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
117
|
+
title = "\n[[#{article.title}]]\n\n"
|
118
|
+
contents = +""
|
119
|
+
end
|
120
|
+
|
121
|
+
unless config[:category_only]
|
122
|
+
article.elements.each do |e|
|
123
|
+
case e.first
|
124
|
+
when :mw_heading
|
125
|
+
break if config[:summary_only]
|
126
|
+
next unless config[:heading]
|
127
|
+
|
128
|
+
e[-1] = format_wiki(e.last, config)
|
129
|
+
line = e.last
|
130
|
+
line << "+HEADING+" if DEBUG_MODE
|
131
|
+
when :mw_paragraph
|
132
|
+
e[-1] = format_wiki(e.last, config)
|
133
|
+
line = e.last + "\n"
|
134
|
+
line << "+PARAGRAPH+" if DEBUG_MODE
|
135
|
+
when :mw_table, :mw_htable
|
136
|
+
next unless config[:table]
|
137
|
+
|
138
|
+
line = e.last
|
139
|
+
line << "+TABLE+" if DEBUG_MODE
|
140
|
+
when :mw_pre
|
141
|
+
next unless config[:pre]
|
142
|
+
|
143
|
+
line = e.last
|
144
|
+
line << "+PRE+" if DEBUG_MODE
|
145
|
+
when :mw_quote
|
146
|
+
line = e.last
|
147
|
+
line << "+QUOTE+" if DEBUG_MODE
|
148
|
+
when :mw_unordered, :mw_ordered, :mw_definition
|
149
|
+
next unless config[:list]
|
150
|
+
|
151
|
+
line = e.last
|
152
|
+
line << "+LIST+" if DEBUG_MODE
|
153
|
+
when :mw_ml_template
|
154
|
+
next unless config[:multiline]
|
155
|
+
|
156
|
+
line = e.last
|
157
|
+
line << "+MLTEMPLATE+" if DEBUG_MODE
|
158
|
+
when :mw_redirect
|
159
|
+
next unless config[:redirect]
|
160
|
+
|
161
|
+
line = e.last
|
162
|
+
line << "+REDIRECT+" if DEBUG_MODE
|
163
|
+
line << "\n\n"
|
164
|
+
when :mw_isolated_template
|
165
|
+
next unless config[:multiline]
|
166
|
+
|
167
|
+
line = e.last
|
168
|
+
line << "+ISOLATED_TEMPLATE+" if DEBUG_MODE
|
169
|
+
when :mw_isolated_tag
|
170
|
+
next
|
171
|
+
else
|
172
|
+
next unless DEBUG_MODE
|
173
|
+
|
174
|
+
line = e.last
|
175
|
+
line << "+OTHER+"
|
176
|
+
end
|
177
|
+
contents << line << "\n"
|
182
178
|
end
|
183
179
|
end
|
184
|
-
contents << line << "\n"
|
185
|
-
end
|
186
|
-
end
|
187
180
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
181
|
+
if /\A[\s ]*\z/m =~ contents
|
182
|
+
""
|
183
|
+
else
|
184
|
+
config[:title] ? title << contents : contents
|
185
|
+
end
|
186
|
+
end
|
192
187
|
end
|
188
|
+
puts pastel.blue.bold("Complete!")
|
193
189
|
end
|
194
190
|
end
|
195
191
|
|
196
|
-
|
197
|
-
|
192
|
+
WpApp.new.run
|
data/lib/wp2txt/article.rb
CHANGED
@@ -1,62 +1,54 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
|
1
|
+
# frozen_string_literal: true
|
6
2
|
|
7
3
|
require 'strscan'
|
8
|
-
|
4
|
+
require_relative 'utils'
|
9
5
|
|
10
6
|
module Wp2txt
|
11
|
-
|
12
7
|
# possible element type, which could be later chosen to print or not to print
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
8
|
+
# :mw_heading
|
9
|
+
# :mw_htable
|
10
|
+
# :mw_quote
|
11
|
+
# :mw_unordered
|
12
|
+
# :mw_ordered
|
13
|
+
# :mw_definition
|
14
|
+
# :mw_pre
|
15
|
+
# :mw_paragraph
|
16
|
+
# :mw_comment
|
17
|
+
# :mw_math
|
18
|
+
# :mw_source
|
19
|
+
# :mw_inputbox
|
20
|
+
# :mw_template
|
21
|
+
# :mw_link
|
22
|
+
# :mw_summary
|
23
|
+
# :mw_blank
|
24
|
+
# :mw_redirect
|
30
25
|
|
31
26
|
# an article contains elements, each of which is [TYPE, string]
|
32
27
|
class Article
|
33
|
-
|
34
28
|
include Wp2txt
|
35
29
|
attr_accessor :elements, :title, :categories
|
36
|
-
|
30
|
+
|
37
31
|
def initialize(text, title = "", strip_tmarker = false)
|
38
32
|
@title = title.strip
|
39
33
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters
|
41
|
-
text.gsub
|
42
|
-
remove_html
|
43
|
-
make_reference
|
44
|
-
remove_ref
|
34
|
+
text = convert_characters(text)
|
35
|
+
text = text.gsub(/\|\n\n+/m) { "|\n" }
|
36
|
+
text = remove_html(text)
|
37
|
+
text = make_reference(text)
|
38
|
+
text = remove_ref(text)
|
45
39
|
parse text
|
46
40
|
end
|
47
|
-
|
48
|
-
def create_element(
|
49
|
-
[
|
41
|
+
|
42
|
+
def create_element(tpx, text)
|
43
|
+
[tpx, text]
|
50
44
|
end
|
51
|
-
|
45
|
+
|
52
46
|
def parse(source)
|
53
47
|
@elements = []
|
54
|
-
@categories
|
48
|
+
@categories = []
|
55
49
|
mode = nil
|
56
|
-
open_stack = []
|
57
|
-
close_stack = []
|
58
50
|
source.each_line do |line|
|
59
|
-
matched = line.scan(
|
51
|
+
matched = line.scan(CATEGORY_REGEX)
|
60
52
|
if matched && !matched.empty?
|
61
53
|
@categories += matched
|
62
54
|
@categories.uniq!
|
@@ -65,108 +57,94 @@ module Wp2txt
|
|
65
57
|
case mode
|
66
58
|
when :mw_ml_template
|
67
59
|
scanner = StringScanner.new(line)
|
68
|
-
str= process_nested_structure(scanner, "{{", "}}") {""}
|
69
|
-
if
|
70
|
-
mode = nil
|
71
|
-
end
|
60
|
+
str = process_nested_structure(scanner, "{{", "}}") { "" }
|
61
|
+
mode = nil if ML_TEMPLATE_END_REGEX =~ str
|
72
62
|
@elements.last.last << line
|
73
63
|
next
|
74
64
|
when :mw_ml_link
|
75
65
|
scanner = StringScanner.new(line)
|
76
|
-
str= process_nested_structure(scanner, "[[", "]]") {""}
|
77
|
-
if
|
78
|
-
mode = nil
|
79
|
-
end
|
66
|
+
str = process_nested_structure(scanner, "[[", "]]") { "" }
|
67
|
+
mode = nil if ML_LINK_END_REGEX =~ str
|
80
68
|
@elements.last.last << line
|
81
69
|
next
|
82
70
|
when :mw_table
|
83
|
-
if
|
84
|
-
mode = nil
|
85
|
-
end
|
71
|
+
mode = nil if IN_TABLE_REGEX2 =~ line
|
86
72
|
@elements.last.last << line
|
87
|
-
next
|
73
|
+
next
|
88
74
|
when :mw_inputbox
|
89
|
-
if
|
90
|
-
mode = nil
|
91
|
-
end
|
75
|
+
mode = nil if IN_INPUTBOX_REGEX2 =~ line
|
92
76
|
@elements.last.last << line
|
93
77
|
next
|
94
78
|
when :mw_source
|
95
|
-
if
|
96
|
-
mode = nil
|
97
|
-
end
|
79
|
+
mode = nil if IN_SOURCE_REGEX2 =~ line
|
98
80
|
@elements.last.last << line
|
99
81
|
next
|
100
82
|
when :mw_math
|
101
|
-
if
|
102
|
-
mode = nil
|
103
|
-
end
|
83
|
+
mode = nil if IN_MATH_REGEX2 =~ line
|
104
84
|
@elements.last.last << line
|
105
85
|
next
|
106
86
|
when :mw_htable
|
107
|
-
if
|
108
|
-
mode = nil
|
109
|
-
end
|
87
|
+
mode = nil if IN_HTML_TABLE_REGEX2 =~ line
|
110
88
|
@elements.last.last << line
|
111
89
|
next
|
112
90
|
end
|
113
91
|
|
114
92
|
case line
|
115
|
-
when
|
93
|
+
when ISOLATED_TEMPLATE_REGEX
|
116
94
|
@elements << create_element(:mw_isolated_template, line)
|
117
|
-
when
|
95
|
+
when ISOLATED_TAG_REGEX
|
118
96
|
@elements << create_element(:mw_isolated_tag, line)
|
119
|
-
when
|
120
|
-
@elements << create_element(:mw_blank, "\n")
|
121
|
-
when
|
97
|
+
when BLANK_LINE_REGEX
|
98
|
+
@elements << create_element(:mw_blank, "\n")
|
99
|
+
when REDIRECT_REGEX
|
122
100
|
@elements << create_element(:mw_redirect, line)
|
123
|
-
when
|
124
|
-
line = line.sub(
|
101
|
+
when IN_HEADING_REGEX
|
102
|
+
line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
|
125
103
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
126
|
-
when
|
104
|
+
when IN_INPUTBOX_REGEX
|
127
105
|
@elements << create_element(:mw_inputbox, line)
|
128
|
-
when
|
106
|
+
when ML_TEMPLATE_ONSET_REGEX
|
129
107
|
@elements << create_element(:mw_ml_template, line)
|
130
108
|
mode = :mw_ml_template
|
131
|
-
when
|
109
|
+
when ML_LINK_ONSET_REGEX
|
132
110
|
@elements << create_element(:mw_ml_link, line)
|
133
111
|
mode = :mw_ml_link
|
134
|
-
when
|
112
|
+
when IN_INPUTBOX_REGEX1
|
135
113
|
mode = :mw_inputbox
|
136
114
|
@elements << create_element(:mw_inputbox, line)
|
137
|
-
when
|
138
|
-
|
139
|
-
when
|
115
|
+
when IN_SOURCE_REGEX
|
116
|
+
@elements << create_element(:mw_source, line)
|
117
|
+
when IN_SOURCE_REGEX1
|
140
118
|
mode = :mw_source
|
141
119
|
@elements << create_element(:mw_source, line)
|
142
|
-
when
|
120
|
+
when IN_MATH_REGEX
|
143
121
|
@elements << create_element(:mw_math, line)
|
144
|
-
when
|
122
|
+
when IN_MATH_REGEX1
|
145
123
|
mode = :mw_math
|
146
124
|
@elements << create_element(:mw_math, line)
|
147
|
-
when
|
125
|
+
when IN_HTML_TABLE_REGEX
|
148
126
|
@elements << create_element(:mw_htable, line)
|
149
|
-
when
|
127
|
+
when IN_HTML_TABLE_REGEX1
|
150
128
|
mode = :mw_htable
|
151
129
|
@elements << create_element(:mw_htable, line)
|
152
|
-
when
|
130
|
+
when IN_TABLE_REGEX1
|
153
131
|
mode = :mw_table
|
154
132
|
@elements << create_element(:mw_table, line)
|
155
|
-
when
|
156
|
-
line = line.sub(
|
133
|
+
when IN_UNORDERED_REGEX
|
134
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
157
135
|
@elements << create_element(:mw_unordered, line)
|
158
|
-
when
|
159
|
-
line = line.sub(
|
136
|
+
when IN_ORDERED_REGEX
|
137
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
160
138
|
@elements << create_element(:mw_ordered, line)
|
161
|
-
when
|
162
|
-
line = line.sub(
|
139
|
+
when IN_PRE_REGEX
|
140
|
+
line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
|
163
141
|
@elements << create_element(:mw_pre, line)
|
164
|
-
when
|
165
|
-
line = line.sub(
|
142
|
+
when IN_DEFINITION_REGEX
|
143
|
+
line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
|
166
144
|
@elements << create_element(:mw_definition, line)
|
167
|
-
when
|
145
|
+
when IN_LINK_REGEX
|
168
146
|
@elements << create_element(:mw_link, line)
|
169
|
-
else
|
147
|
+
else
|
170
148
|
@elements << create_element(:mw_paragraph, "\n" + line)
|
171
149
|
end
|
172
150
|
end
|