wp2txt 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +26 -3
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/lib/wp2txt/article.rb
CHANGED
@@ -1,62 +1,54 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
|
1
|
+
# frozen_string_literal: true
|
6
2
|
|
7
3
|
require 'strscan'
|
8
|
-
|
4
|
+
require_relative 'utils'
|
9
5
|
|
10
6
|
module Wp2txt
|
11
|
-
|
12
7
|
# possible element types, each of which can later be chosen to be printed or not
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
8
|
+
# :mw_heading
|
9
|
+
# :mw_htable
|
10
|
+
# :mw_quote
|
11
|
+
# :mw_unordered
|
12
|
+
# :mw_ordered
|
13
|
+
# :mw_definition
|
14
|
+
# :mw_pre
|
15
|
+
# :mw_paragraph
|
16
|
+
# :mw_comment
|
17
|
+
# :mw_math
|
18
|
+
# :mw_source
|
19
|
+
# :mw_inputbox
|
20
|
+
# :mw_template
|
21
|
+
# :mw_link
|
22
|
+
# :mw_summary
|
23
|
+
# :mw_blank
|
24
|
+
# :mw_redirect
|
30
25
|
|
31
26
|
# an article contains elements, each of which is [TYPE, string]
|
32
27
|
class Article
|
33
|
-
|
34
28
|
include Wp2txt
|
35
29
|
attr_accessor :elements, :title, :categories
|
36
|
-
|
30
|
+
|
37
31
|
def initialize(text, title = "", strip_tmarker = false)
|
38
32
|
@title = title.strip
|
39
33
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters
|
41
|
-
text.gsub
|
42
|
-
remove_html
|
43
|
-
make_reference
|
44
|
-
remove_ref
|
34
|
+
text = convert_characters(text)
|
35
|
+
text = text.gsub(/\|\n\n+/m) { "|\n" }
|
36
|
+
text = remove_html(text)
|
37
|
+
text = make_reference(text)
|
38
|
+
text = remove_ref(text)
|
45
39
|
parse text
|
46
40
|
end
|
47
|
-
|
48
|
-
def create_element(
|
49
|
-
[
|
41
|
+
|
42
|
+
def create_element(tpx, text)
|
43
|
+
[tpx, text]
|
50
44
|
end
|
51
|
-
|
45
|
+
|
52
46
|
def parse(source)
|
53
47
|
@elements = []
|
54
|
-
@categories
|
48
|
+
@categories = []
|
55
49
|
mode = nil
|
56
|
-
open_stack = []
|
57
|
-
close_stack = []
|
58
50
|
source.each_line do |line|
|
59
|
-
matched = line.scan(
|
51
|
+
matched = line.scan(CATEGORY_REGEX)
|
60
52
|
if matched && !matched.empty?
|
61
53
|
@categories += matched
|
62
54
|
@categories.uniq!
|
@@ -65,108 +57,94 @@ module Wp2txt
|
|
65
57
|
case mode
|
66
58
|
when :mw_ml_template
|
67
59
|
scanner = StringScanner.new(line)
|
68
|
-
str= process_nested_structure(scanner, "{{", "}}") {""}
|
69
|
-
if
|
70
|
-
mode = nil
|
71
|
-
end
|
60
|
+
str = process_nested_structure(scanner, "{{", "}}") { "" }
|
61
|
+
mode = nil if ML_TEMPLATE_END_REGEX =~ str
|
72
62
|
@elements.last.last << line
|
73
63
|
next
|
74
64
|
when :mw_ml_link
|
75
65
|
scanner = StringScanner.new(line)
|
76
|
-
str= process_nested_structure(scanner, "[[", "]]") {""}
|
77
|
-
if
|
78
|
-
mode = nil
|
79
|
-
end
|
66
|
+
str = process_nested_structure(scanner, "[[", "]]") { "" }
|
67
|
+
mode = nil if ML_LINK_END_REGEX =~ str
|
80
68
|
@elements.last.last << line
|
81
69
|
next
|
82
70
|
when :mw_table
|
83
|
-
if
|
84
|
-
mode = nil
|
85
|
-
end
|
71
|
+
mode = nil if IN_TABLE_REGEX2 =~ line
|
86
72
|
@elements.last.last << line
|
87
|
-
next
|
73
|
+
next
|
88
74
|
when :mw_inputbox
|
89
|
-
if
|
90
|
-
mode = nil
|
91
|
-
end
|
75
|
+
mode = nil if IN_INPUTBOX_REGEX2 =~ line
|
92
76
|
@elements.last.last << line
|
93
77
|
next
|
94
78
|
when :mw_source
|
95
|
-
if
|
96
|
-
mode = nil
|
97
|
-
end
|
79
|
+
mode = nil if IN_SOURCE_REGEX2 =~ line
|
98
80
|
@elements.last.last << line
|
99
81
|
next
|
100
82
|
when :mw_math
|
101
|
-
if
|
102
|
-
mode = nil
|
103
|
-
end
|
83
|
+
mode = nil if IN_MATH_REGEX2 =~ line
|
104
84
|
@elements.last.last << line
|
105
85
|
next
|
106
86
|
when :mw_htable
|
107
|
-
if
|
108
|
-
mode = nil
|
109
|
-
end
|
87
|
+
mode = nil if IN_HTML_TABLE_REGEX2 =~ line
|
110
88
|
@elements.last.last << line
|
111
89
|
next
|
112
90
|
end
|
113
91
|
|
114
92
|
case line
|
115
|
-
when
|
93
|
+
when ISOLATED_TEMPLATE_REGEX
|
116
94
|
@elements << create_element(:mw_isolated_template, line)
|
117
|
-
when
|
95
|
+
when ISOLATED_TAG_REGEX
|
118
96
|
@elements << create_element(:mw_isolated_tag, line)
|
119
|
-
when
|
120
|
-
@elements << create_element(:mw_blank, "\n")
|
121
|
-
when
|
97
|
+
when BLANK_LINE_REGEX
|
98
|
+
@elements << create_element(:mw_blank, "\n")
|
99
|
+
when REDIRECT_REGEX
|
122
100
|
@elements << create_element(:mw_redirect, line)
|
123
|
-
when
|
124
|
-
line = line.sub(
|
101
|
+
when IN_HEADING_REGEX
|
102
|
+
line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
|
125
103
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
126
|
-
when
|
104
|
+
when IN_INPUTBOX_REGEX
|
127
105
|
@elements << create_element(:mw_inputbox, line)
|
128
|
-
when
|
106
|
+
when ML_TEMPLATE_ONSET_REGEX
|
129
107
|
@elements << create_element(:mw_ml_template, line)
|
130
108
|
mode = :mw_ml_template
|
131
|
-
when
|
109
|
+
when ML_LINK_ONSET_REGEX
|
132
110
|
@elements << create_element(:mw_ml_link, line)
|
133
111
|
mode = :mw_ml_link
|
134
|
-
when
|
112
|
+
when IN_INPUTBOX_REGEX1
|
135
113
|
mode = :mw_inputbox
|
136
114
|
@elements << create_element(:mw_inputbox, line)
|
137
|
-
when
|
138
|
-
|
139
|
-
when
|
115
|
+
when IN_SOURCE_REGEX
|
116
|
+
@elements << create_element(:mw_source, line)
|
117
|
+
when IN_SOURCE_REGEX1
|
140
118
|
mode = :mw_source
|
141
119
|
@elements << create_element(:mw_source, line)
|
142
|
-
when
|
120
|
+
when IN_MATH_REGEX
|
143
121
|
@elements << create_element(:mw_math, line)
|
144
|
-
when
|
122
|
+
when IN_MATH_REGEX1
|
145
123
|
mode = :mw_math
|
146
124
|
@elements << create_element(:mw_math, line)
|
147
|
-
when
|
125
|
+
when IN_HTML_TABLE_REGEX
|
148
126
|
@elements << create_element(:mw_htable, line)
|
149
|
-
when
|
127
|
+
when IN_HTML_TABLE_REGEX1
|
150
128
|
mode = :mw_htable
|
151
129
|
@elements << create_element(:mw_htable, line)
|
152
|
-
when
|
130
|
+
when IN_TABLE_REGEX1
|
153
131
|
mode = :mw_table
|
154
132
|
@elements << create_element(:mw_table, line)
|
155
|
-
when
|
156
|
-
line = line.sub(
|
133
|
+
when IN_UNORDERED_REGEX
|
134
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
157
135
|
@elements << create_element(:mw_unordered, line)
|
158
|
-
when
|
159
|
-
line = line.sub(
|
136
|
+
when IN_ORDERED_REGEX
|
137
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
160
138
|
@elements << create_element(:mw_ordered, line)
|
161
|
-
when
|
162
|
-
line = line.sub(
|
139
|
+
when IN_PRE_REGEX
|
140
|
+
line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
|
163
141
|
@elements << create_element(:mw_pre, line)
|
164
|
-
when
|
165
|
-
line = line.sub(
|
142
|
+
when IN_DEFINITION_REGEX
|
143
|
+
line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
|
166
144
|
@elements << create_element(:mw_definition, line)
|
167
|
-
when
|
145
|
+
when IN_LINK_REGEX
|
168
146
|
@elements << create_element(:mw_link, line)
|
169
|
-
else
|
147
|
+
else
|
170
148
|
@elements << create_element(:mw_paragraph, "\n" + line)
|
171
149
|
end
|
172
150
|
end
|
data/lib/wp2txt/regex.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "htmlentities"
|
4
|
+
|
5
|
+
module Wp2txt
|
6
|
+
###################################################
|
7
|
+
# constants holding precompiled regexps, so each pattern is built only once
|
8
|
+
# those with a trailing number 1 represent opening tag/markup
|
9
|
+
# those with a trailing number 2 represent closing tag/markup
|
10
|
+
# those without a trailing number contain both opening/closing tags/markups
|
11
|
+
|
12
|
+
HTML_DECODER = HTMLEntities.new
|
13
|
+
|
14
|
+
# Pairs of [HTML entity, replacement character]. The first list must hold the
# escaped entity names; if it holds the literal characters, HTML_HASH maps every
# character to itself and entity decoding silently does nothing.
ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
|
15
|
+
HTML_HASH = Hash[*ENTITIES.flatten]
|
16
|
+
HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
|
17
|
+
ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
|
18
|
+
ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
|
19
|
+
ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
|
20
|
+
ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
|
21
|
+
ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
22
|
+
ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
23
|
+
IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')
|
24
|
+
IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
|
25
|
+
IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
|
26
|
+
IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
|
27
|
+
IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
|
28
|
+
IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
|
29
|
+
IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
|
30
|
+
IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
|
31
|
+
IN_MATH_REGEX1 = Regexp.new('<math.*?>')
|
32
|
+
IN_MATH_REGEX2 = Regexp.new('<\/math>')
|
33
|
+
IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
|
34
|
+
# Matches an HTML table that opens AND closes on the same line, with any
# content in between (same shape as IN_SOURCE_REGEX / IN_MATH_REGEX).
# Without the inner `.*?` only an empty `<table></table>` matched, so a
# one-line table with cells was treated as the start of a multi-line table.
IN_HTML_TABLE_REGEX = Regexp.new('<table.*?>.*?<\/table>')
|
35
|
+
IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
|
36
|
+
IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
|
37
|
+
IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
|
38
|
+
IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
|
39
|
+
IN_UNORDERED_REGEX = Regexp.new('^\*')
|
40
|
+
IN_ORDERED_REGEX = Regexp.new('^\#')
|
41
|
+
IN_PRE_REGEX = Regexp.new('^ ')
|
42
|
+
IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
|
43
|
+
BLANK_LINE_REGEX = Regexp.new('^\s*$')
|
44
|
+
REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
45
|
+
REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
|
46
|
+
REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
|
47
|
+
REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
48
|
+
CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
49
|
+
MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
|
50
|
+
REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
|
51
|
+
MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
|
52
|
+
MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
|
53
|
+
MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
|
54
|
+
MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
|
55
|
+
FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
56
|
+
HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
|
57
|
+
HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
|
58
|
+
LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
|
59
|
+
# Marker of a preformatted line: the single leading space detected by
# IN_PRE_REGEX. The previous pattern ('\A\^\ ') looked for a literal caret
# followed by a space, which never occurs on such a line, so the marker was
# never stripped.
PRE_MARKS_REGEX = Regexp.new('\A\ ')
|
60
|
+
DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
|
61
|
+
ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')
|
62
|
+
|
63
|
+
CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
|
64
|
+
CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
65
|
+
|
66
|
+
ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
67
|
+
UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')
|
68
|
+
|
69
|
+
REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
70
|
+
REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
|
71
|
+
TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
72
|
+
|
73
|
+
SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
|
74
|
+
DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
|
75
|
+
SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
|
76
|
+
DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
|
77
|
+
CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
|
78
|
+
|
79
|
+
COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
80
|
+
COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
81
|
+
COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
82
|
+
COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
+
COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
84
|
+
|
85
|
+
CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
86
|
+
CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
|
87
|
+
CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
|
88
|
+
CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
|
89
|
+
CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
|
90
|
+
CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
|
91
|
+
CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
|
92
|
+
CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
93
|
+
end
|