wp2txt 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +42 -13
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +172 -282
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -11
data/bin/wp2txt
CHANGED
@@ -1,197 +1,192 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
-
|
7
|
-
$DEBUG_MODE = false
|
8
|
-
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
|
9
|
-
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
10
|
-
|
11
|
-
require 'wp2txt'
|
12
|
-
require 'wp2txt/utils'
|
13
|
-
require 'wp2txt/version'
|
14
|
-
require 'etc'
|
15
|
-
require 'optimist'
|
16
|
-
require 'parallel'
|
17
|
-
require 'pastel'
|
18
|
-
require 'tty-spinner'
|
19
|
-
|
20
|
-
include Wp2txt
|
21
|
-
|
22
|
-
opts = Optimist::options do
|
23
|
-
version Wp2txt::VERSION
|
24
|
-
banner <<-EOS
|
25
|
-
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
26
|
-
|
27
|
-
Usage: wp2txt [options]
|
28
|
-
where [options] are:
|
29
|
-
EOS
|
30
|
-
|
31
|
-
opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", :required => true, :short => "-i"
|
32
|
-
opt :output_dir, "Path to output directory", :default => Dir::pwd, :type => String, :short => "-o"
|
33
|
-
opt :convert, "Output in plain text (converting from XML)", :default => true, :short => "-c"
|
34
|
-
opt :category, "Show article category information", :default => true, :short => "-a"
|
35
|
-
opt :category_only, "Extract only article title and categories", :default => false, :short => "-g"
|
36
|
-
opt :summary_only, "Extract only article title, categories, and summary text before first heading", :default => false, :short => "-s"
|
37
|
-
opt :file_size, "Approximate size (in MB) of each output file", :default => 10, :short => "-f"
|
38
|
-
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", :short => "-n"
|
39
|
-
opt :del_interfile, "Delete intermediate XML files from output dir", :short => "-x", :default => false
|
40
|
-
opt :title, "Keep page titles in output", :default => true, :short => "-t"
|
41
|
-
opt :heading, "Keep section titles in output", :default => true, :short => "-d"
|
42
|
-
opt :list, "Keep unprocessed list items in output", :default => false, :short => "-l"
|
43
|
-
opt :ref, "Keep reference notations in the format [ref]...[/ref]", :default => false, :short => "-r"
|
44
|
-
opt :redirect, "Show redirect destination", :default => false, :short => "-e"
|
45
|
-
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true, :short => "-m"
|
46
|
-
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", :default => false, :short => "-b"
|
47
|
-
end
|
48
|
-
|
49
|
-
Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
50
|
-
Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
51
|
-
|
52
|
-
pastel = Pastel.new
|
53
2
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
DEBUG_MODE = false
|
6
|
+
SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
|
7
|
+
DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
|
8
|
+
|
9
|
+
require_relative "../lib/wp2txt"
|
10
|
+
require_relative "../lib/wp2txt/utils"
|
11
|
+
require_relative "../lib/wp2txt/version"
|
12
|
+
|
13
|
+
require "etc"
|
14
|
+
require "optimist"
|
15
|
+
require "parallel"
|
16
|
+
require "pastel"
|
17
|
+
require "tty-spinner"
|
18
|
+
|
19
|
+
class WpApp
|
20
|
+
include Wp2txt
|
21
|
+
|
22
|
+
def run
|
23
|
+
opts = Optimist.options do
|
24
|
+
version VERSION
|
25
|
+
banner <<~BANNER
|
26
|
+
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
27
|
+
|
28
|
+
Usage: wp2txt [options]
|
29
|
+
where [options] are:
|
30
|
+
BANNER
|
31
|
+
|
32
|
+
opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", type: String, required: true, short: "-i"
|
33
|
+
opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
|
34
|
+
opt :convert, "Output in plain text (converting from XML)", default: true, short: "-c"
|
35
|
+
opt :category, "Show article category information", default: true, short: "-a"
|
36
|
+
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
37
|
+
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
38
|
+
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
39
|
+
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
|
40
|
+
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
41
|
+
opt :title, "Keep page titles in output", default: true, short: "-t"
|
42
|
+
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
43
|
+
opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
|
44
|
+
opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
|
45
|
+
opt :redirect, "Show redirect destination", default: false, short: "-e"
|
46
|
+
opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
|
47
|
+
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
|
48
|
+
end
|
86
49
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
50
|
+
Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
51
|
+
Optimist.die :input, "must exist" unless File.exist?(opts[:input])
|
52
|
+
Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
53
|
+
|
54
|
+
pastel = Pastel.new
|
55
|
+
|
56
|
+
input_file = opts[:input]
|
57
|
+
output_dir = opts[:output_dir]
|
58
|
+
tfile_size = opts[:file_size]
|
59
|
+
num_processors = Etc.nprocessors
|
60
|
+
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
|
61
|
+
opts[:num_procs]
|
62
|
+
else
|
63
|
+
num_processors - 2
|
64
|
+
end
|
65
|
+
num_processes = 1 if num_processes < 1
|
66
|
+
|
67
|
+
convert = opts[:convert]
|
68
|
+
strip_tmarker = opts[:marker] ? false : true
|
69
|
+
opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
|
70
|
+
|
71
|
+
config = {}
|
72
|
+
opt_array.each do |opt|
|
73
|
+
config[opt] = opts[opt]
|
74
|
+
end
|
107
75
|
|
108
|
-
|
109
|
-
|
110
|
-
puts "Number of files being processed: " + pastel.bold("#{input_files.size}")
|
111
|
-
puts "Number of CPU cores being used: " + pastel.bold("#{num_processes}")
|
112
|
-
|
113
|
-
Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |input_file|
|
114
|
-
wpconv = Wp2txt::Runner.new(input_file, output_dir, strip_tmarker, config[:del_interfile])
|
115
|
-
wpconv.extract_text do |article|
|
116
|
-
format_wiki!(article.title)
|
117
|
-
|
118
|
-
if config[:category_only]
|
119
|
-
title = "#{article.title}\t"
|
120
|
-
contents = article.categories.join(", ")
|
121
|
-
contents << "\n"
|
122
|
-
elsif config[:category] && !article.categories.empty?
|
123
|
-
title = "\n[[#{article.title}]]\n\n"
|
124
|
-
contents = "\nCATEGORIES: "
|
125
|
-
contents << article.categories.join(", ")
|
126
|
-
contents << "\n\n"
|
76
|
+
if File.ftype(input_file) == "directory"
|
77
|
+
input_files = Dir.glob("#{input_file}/*.xml")
|
127
78
|
else
|
128
|
-
|
129
|
-
|
79
|
+
puts ""
|
80
|
+
puts pastel.green.bold("Preprocessing")
|
81
|
+
puts "Decompressing and splitting the original dump file."
|
82
|
+
puts pastel.underline("This may take a while. Please be patient!")
|
83
|
+
|
84
|
+
time_start = Time.now.to_i
|
85
|
+
wpsplitter = Splitter.new(input_file, output_dir, tfile_size)
|
86
|
+
spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
|
87
|
+
spinner.auto_spin
|
88
|
+
wpsplitter.split_file
|
89
|
+
time_finish = Time.now.to_i
|
90
|
+
|
91
|
+
spinner.stop("Time: #{sec_to_str(time_finish - time_start)}") # Stop animation
|
92
|
+
puts pastel.blue.bold("Complete!")
|
93
|
+
exit unless convert
|
94
|
+
input_files = Dir.glob("#{output_dir}/*.xml")
|
130
95
|
end
|
131
96
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
line = e.last
|
152
|
-
line << "+PRE+" if $DEBUG_MODE
|
153
|
-
when :mw_quote
|
154
|
-
line = e.last
|
155
|
-
line << "+QUOTE+" if $DEBUG_MODE
|
156
|
-
when :mw_unordered, :mw_ordered, :mw_definition
|
157
|
-
next if !config[:list]
|
158
|
-
line = e.last
|
159
|
-
line << "+LIST+" if $DEBUG_MODE
|
160
|
-
when :mw_ml_template
|
161
|
-
next if !config[:multiline]
|
162
|
-
line = e.last
|
163
|
-
line << "+MLTEMPLATE+" if $DEBUG_MODE
|
164
|
-
when :mw_redirect
|
165
|
-
next if !config[:redirect]
|
166
|
-
line = e.last
|
167
|
-
line << "+REDIRECT+" if $DEBUG_MODE
|
168
|
-
line << "\n\n"
|
169
|
-
when :mw_isolated_template
|
170
|
-
next if !config[:multiline]
|
171
|
-
line = e.last
|
172
|
-
line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
|
173
|
-
when :mw_isolated_tag
|
174
|
-
next
|
97
|
+
puts ""
|
98
|
+
puts pastel.red.bold("Converting")
|
99
|
+
puts "Number of files being processed: " + pastel.bold(input_files.size.to_s)
|
100
|
+
puts "Number of CPU cores being used: " + pastel.bold(num_processes.to_s)
|
101
|
+
|
102
|
+
Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |infile|
|
103
|
+
wpconv = Runner.new(infile, output_dir, strip_tmarker, config[:del_interfile])
|
104
|
+
wpconv.extract_text do |article|
|
105
|
+
article.title = format_wiki(article.title, config)
|
106
|
+
|
107
|
+
if config[:category_only]
|
108
|
+
title = "#{article.title}\t"
|
109
|
+
contents = article.categories.join(", ")
|
110
|
+
contents << "\n"
|
111
|
+
elsif config[:category] && !article.categories.empty?
|
112
|
+
title = "\n[[#{article.title}]]\n\n"
|
113
|
+
contents = +"\nCATEGORIES: "
|
114
|
+
contents << article.categories.join(", ")
|
115
|
+
contents << "\n\n"
|
175
116
|
else
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
117
|
+
title = "\n[[#{article.title}]]\n\n"
|
118
|
+
contents = +""
|
119
|
+
end
|
120
|
+
|
121
|
+
unless config[:category_only]
|
122
|
+
article.elements.each do |e|
|
123
|
+
case e.first
|
124
|
+
when :mw_heading
|
125
|
+
break if config[:summary_only]
|
126
|
+
next unless config[:heading]
|
127
|
+
|
128
|
+
e[-1] = format_wiki(e.last, config)
|
129
|
+
line = e.last
|
130
|
+
line << "+HEADING+" if DEBUG_MODE
|
131
|
+
when :mw_paragraph
|
132
|
+
e[-1] = format_wiki(e.last, config)
|
133
|
+
line = e.last + "\n"
|
134
|
+
line << "+PARAGRAPH+" if DEBUG_MODE
|
135
|
+
when :mw_table, :mw_htable
|
136
|
+
next unless config[:table]
|
137
|
+
|
138
|
+
line = e.last
|
139
|
+
line << "+TABLE+" if DEBUG_MODE
|
140
|
+
when :mw_pre
|
141
|
+
next unless config[:pre]
|
142
|
+
|
143
|
+
line = e.last
|
144
|
+
line << "+PRE+" if DEBUG_MODE
|
145
|
+
when :mw_quote
|
146
|
+
line = e.last
|
147
|
+
line << "+QUOTE+" if DEBUG_MODE
|
148
|
+
when :mw_unordered, :mw_ordered, :mw_definition
|
149
|
+
next unless config[:list]
|
150
|
+
|
151
|
+
line = e.last
|
152
|
+
line << "+LIST+" if DEBUG_MODE
|
153
|
+
when :mw_ml_template
|
154
|
+
next unless config[:multiline]
|
155
|
+
|
156
|
+
line = e.last
|
157
|
+
line << "+MLTEMPLATE+" if DEBUG_MODE
|
158
|
+
when :mw_redirect
|
159
|
+
next unless config[:redirect]
|
160
|
+
|
161
|
+
line = e.last
|
162
|
+
line << "+REDIRECT+" if DEBUG_MODE
|
163
|
+
line << "\n\n"
|
164
|
+
when :mw_isolated_template
|
165
|
+
next unless config[:multiline]
|
166
|
+
|
167
|
+
line = e.last
|
168
|
+
line << "+ISOLATED_TEMPLATE+" if DEBUG_MODE
|
169
|
+
when :mw_isolated_tag
|
170
|
+
next
|
171
|
+
else
|
172
|
+
next unless DEBUG_MODE
|
173
|
+
|
174
|
+
line = e.last
|
175
|
+
line << "+OTHER+"
|
176
|
+
end
|
177
|
+
contents << line << "\n"
|
182
178
|
end
|
183
179
|
end
|
184
|
-
contents << line << "\n"
|
185
|
-
end
|
186
|
-
end
|
187
180
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
181
|
+
if /\A[\s ]*\z/m =~ contents
|
182
|
+
""
|
183
|
+
else
|
184
|
+
config[:title] ? title << contents : contents
|
185
|
+
end
|
186
|
+
end
|
192
187
|
end
|
188
|
+
puts pastel.blue.bold("Complete!")
|
193
189
|
end
|
194
190
|
end
|
195
191
|
|
196
|
-
|
197
|
-
|
192
|
+
WpApp.new.run
|
data/lib/wp2txt/article.rb
CHANGED
@@ -1,62 +1,54 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
|
1
|
+
# frozen_string_literal: true
|
6
2
|
|
7
3
|
require 'strscan'
|
8
|
-
|
4
|
+
require_relative 'utils'
|
9
5
|
|
10
6
|
module Wp2txt
|
11
|
-
|
12
7
|
# possible element type, which could be later chosen to print or not to print
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
8
|
+
# :mw_heading
|
9
|
+
# :mw_htable
|
10
|
+
# :mw_quote
|
11
|
+
# :mw_unordered
|
12
|
+
# :mw_ordered
|
13
|
+
# :mw_definition
|
14
|
+
# :mw_pre
|
15
|
+
# :mw_paragraph
|
16
|
+
# :mw_comment
|
17
|
+
# :mw_math
|
18
|
+
# :mw_source
|
19
|
+
# :mw_inputbox
|
20
|
+
# :mw_template
|
21
|
+
# :mw_link
|
22
|
+
# :mw_summary
|
23
|
+
# :mw_blank
|
24
|
+
# :mw_redirect
|
30
25
|
|
31
26
|
# an article contains elements, each of which is [TYPE, string]
|
32
27
|
class Article
|
33
|
-
|
34
28
|
include Wp2txt
|
35
29
|
attr_accessor :elements, :title, :categories
|
36
|
-
|
30
|
+
|
37
31
|
def initialize(text, title = "", strip_tmarker = false)
|
38
32
|
@title = title.strip
|
39
33
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters
|
41
|
-
text.gsub
|
42
|
-
remove_html
|
43
|
-
make_reference
|
44
|
-
remove_ref
|
34
|
+
text = convert_characters(text)
|
35
|
+
text = text.gsub(/\|\n\n+/m) { "|\n" }
|
36
|
+
text = remove_html(text)
|
37
|
+
text = make_reference(text)
|
38
|
+
text = remove_ref(text)
|
45
39
|
parse text
|
46
40
|
end
|
47
|
-
|
48
|
-
def create_element(
|
49
|
-
[
|
41
|
+
|
42
|
+
def create_element(tpx, text)
|
43
|
+
[tpx, text]
|
50
44
|
end
|
51
|
-
|
45
|
+
|
52
46
|
def parse(source)
|
53
47
|
@elements = []
|
54
|
-
@categories
|
48
|
+
@categories = []
|
55
49
|
mode = nil
|
56
|
-
open_stack = []
|
57
|
-
close_stack = []
|
58
50
|
source.each_line do |line|
|
59
|
-
matched = line.scan(
|
51
|
+
matched = line.scan(CATEGORY_REGEX)
|
60
52
|
if matched && !matched.empty?
|
61
53
|
@categories += matched
|
62
54
|
@categories.uniq!
|
@@ -65,108 +57,94 @@ module Wp2txt
|
|
65
57
|
case mode
|
66
58
|
when :mw_ml_template
|
67
59
|
scanner = StringScanner.new(line)
|
68
|
-
str= process_nested_structure(scanner, "{{", "}}") {""}
|
69
|
-
if
|
70
|
-
mode = nil
|
71
|
-
end
|
60
|
+
str = process_nested_structure(scanner, "{{", "}}") { "" }
|
61
|
+
mode = nil if ML_TEMPLATE_END_REGEX =~ str
|
72
62
|
@elements.last.last << line
|
73
63
|
next
|
74
64
|
when :mw_ml_link
|
75
65
|
scanner = StringScanner.new(line)
|
76
|
-
str= process_nested_structure(scanner, "[[", "]]") {""}
|
77
|
-
if
|
78
|
-
mode = nil
|
79
|
-
end
|
66
|
+
str = process_nested_structure(scanner, "[[", "]]") { "" }
|
67
|
+
mode = nil if ML_LINK_END_REGEX =~ str
|
80
68
|
@elements.last.last << line
|
81
69
|
next
|
82
70
|
when :mw_table
|
83
|
-
if
|
84
|
-
mode = nil
|
85
|
-
end
|
71
|
+
mode = nil if IN_TABLE_REGEX2 =~ line
|
86
72
|
@elements.last.last << line
|
87
|
-
next
|
73
|
+
next
|
88
74
|
when :mw_inputbox
|
89
|
-
if
|
90
|
-
mode = nil
|
91
|
-
end
|
75
|
+
mode = nil if IN_INPUTBOX_REGEX2 =~ line
|
92
76
|
@elements.last.last << line
|
93
77
|
next
|
94
78
|
when :mw_source
|
95
|
-
if
|
96
|
-
mode = nil
|
97
|
-
end
|
79
|
+
mode = nil if IN_SOURCE_REGEX2 =~ line
|
98
80
|
@elements.last.last << line
|
99
81
|
next
|
100
82
|
when :mw_math
|
101
|
-
if
|
102
|
-
mode = nil
|
103
|
-
end
|
83
|
+
mode = nil if IN_MATH_REGEX2 =~ line
|
104
84
|
@elements.last.last << line
|
105
85
|
next
|
106
86
|
when :mw_htable
|
107
|
-
if
|
108
|
-
mode = nil
|
109
|
-
end
|
87
|
+
mode = nil if IN_HTML_TABLE_REGEX2 =~ line
|
110
88
|
@elements.last.last << line
|
111
89
|
next
|
112
90
|
end
|
113
91
|
|
114
92
|
case line
|
115
|
-
when
|
93
|
+
when ISOLATED_TEMPLATE_REGEX
|
116
94
|
@elements << create_element(:mw_isolated_template, line)
|
117
|
-
when
|
95
|
+
when ISOLATED_TAG_REGEX
|
118
96
|
@elements << create_element(:mw_isolated_tag, line)
|
119
|
-
when
|
120
|
-
@elements << create_element(:mw_blank, "\n")
|
121
|
-
when
|
97
|
+
when BLANK_LINE_REGEX
|
98
|
+
@elements << create_element(:mw_blank, "\n")
|
99
|
+
when REDIRECT_REGEX
|
122
100
|
@elements << create_element(:mw_redirect, line)
|
123
|
-
when
|
124
|
-
line = line.sub(
|
101
|
+
when IN_HEADING_REGEX
|
102
|
+
line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
|
125
103
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
126
|
-
when
|
104
|
+
when IN_INPUTBOX_REGEX
|
127
105
|
@elements << create_element(:mw_inputbox, line)
|
128
|
-
when
|
106
|
+
when ML_TEMPLATE_ONSET_REGEX
|
129
107
|
@elements << create_element(:mw_ml_template, line)
|
130
108
|
mode = :mw_ml_template
|
131
|
-
when
|
109
|
+
when ML_LINK_ONSET_REGEX
|
132
110
|
@elements << create_element(:mw_ml_link, line)
|
133
111
|
mode = :mw_ml_link
|
134
|
-
when
|
112
|
+
when IN_INPUTBOX_REGEX1
|
135
113
|
mode = :mw_inputbox
|
136
114
|
@elements << create_element(:mw_inputbox, line)
|
137
|
-
when
|
138
|
-
|
139
|
-
when
|
115
|
+
when IN_SOURCE_REGEX
|
116
|
+
@elements << create_element(:mw_source, line)
|
117
|
+
when IN_SOURCE_REGEX1
|
140
118
|
mode = :mw_source
|
141
119
|
@elements << create_element(:mw_source, line)
|
142
|
-
when
|
120
|
+
when IN_MATH_REGEX
|
143
121
|
@elements << create_element(:mw_math, line)
|
144
|
-
when
|
122
|
+
when IN_MATH_REGEX1
|
145
123
|
mode = :mw_math
|
146
124
|
@elements << create_element(:mw_math, line)
|
147
|
-
when
|
125
|
+
when IN_HTML_TABLE_REGEX
|
148
126
|
@elements << create_element(:mw_htable, line)
|
149
|
-
when
|
127
|
+
when IN_HTML_TABLE_REGEX1
|
150
128
|
mode = :mw_htable
|
151
129
|
@elements << create_element(:mw_htable, line)
|
152
|
-
when
|
130
|
+
when IN_TABLE_REGEX1
|
153
131
|
mode = :mw_table
|
154
132
|
@elements << create_element(:mw_table, line)
|
155
|
-
when
|
156
|
-
line = line.sub(
|
133
|
+
when IN_UNORDERED_REGEX
|
134
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
157
135
|
@elements << create_element(:mw_unordered, line)
|
158
|
-
when
|
159
|
-
line = line.sub(
|
136
|
+
when IN_ORDERED_REGEX
|
137
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
160
138
|
@elements << create_element(:mw_ordered, line)
|
161
|
-
when
|
162
|
-
line = line.sub(
|
139
|
+
when IN_PRE_REGEX
|
140
|
+
line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
|
163
141
|
@elements << create_element(:mw_pre, line)
|
164
|
-
when
|
165
|
-
line = line.sub(
|
142
|
+
when IN_DEFINITION_REGEX
|
143
|
+
line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
|
166
144
|
@elements << create_element(:mw_definition, line)
|
167
|
-
when
|
145
|
+
when IN_LINK_REGEX
|
168
146
|
@elements << create_element(:mw_link, line)
|
169
|
-
else
|
147
|
+
else
|
170
148
|
@elements << create_element(:mw_paragraph, "\n" + line)
|
171
149
|
end
|
172
150
|
end
|