wp2txt 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +26 -3
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/lib/wp2txt/article.rb
CHANGED
@@ -1,62 +1,54 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
|
1
|
+
# frozen_string_literal: true
|
6
2
|
|
7
3
|
require 'strscan'
|
8
|
-
|
4
|
+
require_relative 'utils'
|
9
5
|
|
10
6
|
module Wp2txt
|
11
|
-
|
12
7
|
# possible element type, which could be later chosen to print or not to print
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
8
|
+
# :mw_heading
|
9
|
+
# :mw_htable
|
10
|
+
# :mw_quote
|
11
|
+
# :mw_unordered
|
12
|
+
# :mw_ordered
|
13
|
+
# :mw_definition
|
14
|
+
# :mw_pre
|
15
|
+
# :mw_paragraph
|
16
|
+
# :mw_comment
|
17
|
+
# :mw_math
|
18
|
+
# :mw_source
|
19
|
+
# :mw_inputbox
|
20
|
+
# :mw_template
|
21
|
+
# :mw_link
|
22
|
+
# :mw_summary
|
23
|
+
# :mw_blank
|
24
|
+
# :mw_redirect
|
30
25
|
|
31
26
|
# an article contains elements, each of which is [TYPE, string]
|
32
27
|
class Article
|
33
|
-
|
34
28
|
include Wp2txt
|
35
29
|
attr_accessor :elements, :title, :categories
|
36
|
-
|
30
|
+
|
37
31
|
def initialize(text, title = "", strip_tmarker = false)
|
38
32
|
@title = title.strip
|
39
33
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters
|
41
|
-
text.gsub
|
42
|
-
remove_html
|
43
|
-
make_reference
|
44
|
-
remove_ref
|
34
|
+
text = convert_characters(text)
|
35
|
+
text = text.gsub(/\|\n\n+/m) { "|\n" }
|
36
|
+
text = remove_html(text)
|
37
|
+
text = make_reference(text)
|
38
|
+
text = remove_ref(text)
|
45
39
|
parse text
|
46
40
|
end
|
47
|
-
|
48
|
-
def create_element(
|
49
|
-
[
|
41
|
+
|
42
|
+
def create_element(tpx, text)
|
43
|
+
[tpx, text]
|
50
44
|
end
|
51
|
-
|
45
|
+
|
52
46
|
def parse(source)
|
53
47
|
@elements = []
|
54
|
-
@categories
|
48
|
+
@categories = []
|
55
49
|
mode = nil
|
56
|
-
open_stack = []
|
57
|
-
close_stack = []
|
58
50
|
source.each_line do |line|
|
59
|
-
matched = line.scan(
|
51
|
+
matched = line.scan(CATEGORY_REGEX)
|
60
52
|
if matched && !matched.empty?
|
61
53
|
@categories += matched
|
62
54
|
@categories.uniq!
|
@@ -65,108 +57,94 @@ module Wp2txt
|
|
65
57
|
case mode
|
66
58
|
when :mw_ml_template
|
67
59
|
scanner = StringScanner.new(line)
|
68
|
-
str= process_nested_structure(scanner, "{{", "}}") {""}
|
69
|
-
if
|
70
|
-
mode = nil
|
71
|
-
end
|
60
|
+
str = process_nested_structure(scanner, "{{", "}}") { "" }
|
61
|
+
mode = nil if ML_TEMPLATE_END_REGEX =~ str
|
72
62
|
@elements.last.last << line
|
73
63
|
next
|
74
64
|
when :mw_ml_link
|
75
65
|
scanner = StringScanner.new(line)
|
76
|
-
str= process_nested_structure(scanner, "[[", "]]") {""}
|
77
|
-
if
|
78
|
-
mode = nil
|
79
|
-
end
|
66
|
+
str = process_nested_structure(scanner, "[[", "]]") { "" }
|
67
|
+
mode = nil if ML_LINK_END_REGEX =~ str
|
80
68
|
@elements.last.last << line
|
81
69
|
next
|
82
70
|
when :mw_table
|
83
|
-
if
|
84
|
-
mode = nil
|
85
|
-
end
|
71
|
+
mode = nil if IN_TABLE_REGEX2 =~ line
|
86
72
|
@elements.last.last << line
|
87
|
-
next
|
73
|
+
next
|
88
74
|
when :mw_inputbox
|
89
|
-
if
|
90
|
-
mode = nil
|
91
|
-
end
|
75
|
+
mode = nil if IN_INPUTBOX_REGEX2 =~ line
|
92
76
|
@elements.last.last << line
|
93
77
|
next
|
94
78
|
when :mw_source
|
95
|
-
if
|
96
|
-
mode = nil
|
97
|
-
end
|
79
|
+
mode = nil if IN_SOURCE_REGEX2 =~ line
|
98
80
|
@elements.last.last << line
|
99
81
|
next
|
100
82
|
when :mw_math
|
101
|
-
if
|
102
|
-
mode = nil
|
103
|
-
end
|
83
|
+
mode = nil if IN_MATH_REGEX2 =~ line
|
104
84
|
@elements.last.last << line
|
105
85
|
next
|
106
86
|
when :mw_htable
|
107
|
-
if
|
108
|
-
mode = nil
|
109
|
-
end
|
87
|
+
mode = nil if IN_HTML_TABLE_REGEX2 =~ line
|
110
88
|
@elements.last.last << line
|
111
89
|
next
|
112
90
|
end
|
113
91
|
|
114
92
|
case line
|
115
|
-
when
|
93
|
+
when ISOLATED_TEMPLATE_REGEX
|
116
94
|
@elements << create_element(:mw_isolated_template, line)
|
117
|
-
when
|
95
|
+
when ISOLATED_TAG_REGEX
|
118
96
|
@elements << create_element(:mw_isolated_tag, line)
|
119
|
-
when
|
120
|
-
@elements << create_element(:mw_blank, "\n")
|
121
|
-
when
|
97
|
+
when BLANK_LINE_REGEX
|
98
|
+
@elements << create_element(:mw_blank, "\n")
|
99
|
+
when REDIRECT_REGEX
|
122
100
|
@elements << create_element(:mw_redirect, line)
|
123
|
-
when
|
124
|
-
line = line.sub(
|
101
|
+
when IN_HEADING_REGEX
|
102
|
+
line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
|
125
103
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
126
|
-
when
|
104
|
+
when IN_INPUTBOX_REGEX
|
127
105
|
@elements << create_element(:mw_inputbox, line)
|
128
|
-
when
|
106
|
+
when ML_TEMPLATE_ONSET_REGEX
|
129
107
|
@elements << create_element(:mw_ml_template, line)
|
130
108
|
mode = :mw_ml_template
|
131
|
-
when
|
109
|
+
when ML_LINK_ONSET_REGEX
|
132
110
|
@elements << create_element(:mw_ml_link, line)
|
133
111
|
mode = :mw_ml_link
|
134
|
-
when
|
112
|
+
when IN_INPUTBOX_REGEX1
|
135
113
|
mode = :mw_inputbox
|
136
114
|
@elements << create_element(:mw_inputbox, line)
|
137
|
-
when
|
138
|
-
|
139
|
-
when
|
115
|
+
when IN_SOURCE_REGEX
|
116
|
+
@elements << create_element(:mw_source, line)
|
117
|
+
when IN_SOURCE_REGEX1
|
140
118
|
mode = :mw_source
|
141
119
|
@elements << create_element(:mw_source, line)
|
142
|
-
when
|
120
|
+
when IN_MATH_REGEX
|
143
121
|
@elements << create_element(:mw_math, line)
|
144
|
-
when
|
122
|
+
when IN_MATH_REGEX1
|
145
123
|
mode = :mw_math
|
146
124
|
@elements << create_element(:mw_math, line)
|
147
|
-
when
|
125
|
+
when IN_HTML_TABLE_REGEX
|
148
126
|
@elements << create_element(:mw_htable, line)
|
149
|
-
when
|
127
|
+
when IN_HTML_TABLE_REGEX1
|
150
128
|
mode = :mw_htable
|
151
129
|
@elements << create_element(:mw_htable, line)
|
152
|
-
when
|
130
|
+
when IN_TABLE_REGEX1
|
153
131
|
mode = :mw_table
|
154
132
|
@elements << create_element(:mw_table, line)
|
155
|
-
when
|
156
|
-
line = line.sub(
|
133
|
+
when IN_UNORDERED_REGEX
|
134
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
157
135
|
@elements << create_element(:mw_unordered, line)
|
158
|
-
when
|
159
|
-
line = line.sub(
|
136
|
+
when IN_ORDERED_REGEX
|
137
|
+
line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
|
160
138
|
@elements << create_element(:mw_ordered, line)
|
161
|
-
when
|
162
|
-
line = line.sub(
|
139
|
+
when IN_PRE_REGEX
|
140
|
+
line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
|
163
141
|
@elements << create_element(:mw_pre, line)
|
164
|
-
when
|
165
|
-
line = line.sub(
|
142
|
+
when IN_DEFINITION_REGEX
|
143
|
+
line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
|
166
144
|
@elements << create_element(:mw_definition, line)
|
167
|
-
when
|
145
|
+
when IN_LINK_REGEX
|
168
146
|
@elements << create_element(:mw_link, line)
|
169
|
-
else
|
147
|
+
else
|
170
148
|
@elements << create_element(:mw_paragraph, "\n" + line)
|
171
149
|
end
|
172
150
|
end
|
data/lib/wp2txt/regex.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "htmlentities"
|
4
|
+
|
5
|
+
module Wp2txt
|
6
|
+
###################################################
|
7
|
+
# variables to save resource for generating regexps
|
8
|
+
# those with a trailing number 1 represent opening tag/markup
|
9
|
+
# those with a trailing number 2 represent closing tag/markup
|
10
|
+
# those without a trailing number contain both opening/closing tags/markups
|
11
|
+
|
12
|
+
HTML_DECODER = HTMLEntities.new
|
13
|
+
|
14
|
+
# Pairs of [HTML entity name, replacement character]; the keys must stay in
# their escaped form so HTML_REGEX can find them in the wiki text.
ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
|
15
|
+
HTML_HASH = Hash[*ENTITIES.flatten]
|
16
|
+
HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
|
17
|
+
ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
|
18
|
+
ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
|
19
|
+
ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
|
20
|
+
ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
|
21
|
+
ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
22
|
+
ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
23
|
+
IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')
|
24
|
+
IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
|
25
|
+
IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
|
26
|
+
IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
|
27
|
+
IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
|
28
|
+
IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
|
29
|
+
IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
|
30
|
+
IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
|
31
|
+
IN_MATH_REGEX1 = Regexp.new('<math.*?>')
|
32
|
+
IN_MATH_REGEX2 = Regexp.new('<\/math>')
|
33
|
+
IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
|
34
|
+
# Matches an HTML table that is opened and closed on the same line, with or
# without content in between (mirrors IN_SOURCE_REGEX / IN_MATH_REGEX).
IN_HTML_TABLE_REGEX = Regexp.new('<table.*?>.*?<\/\s*table>')
|
35
|
+
IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
|
36
|
+
IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
|
37
|
+
IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
|
38
|
+
IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
|
39
|
+
IN_UNORDERED_REGEX = Regexp.new('^\*')
|
40
|
+
IN_ORDERED_REGEX = Regexp.new('^\#')
|
41
|
+
IN_PRE_REGEX = Regexp.new('^ ')
|
42
|
+
IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
|
43
|
+
BLANK_LINE_REGEX = Regexp.new('^\s*$')
|
44
|
+
REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
45
|
+
REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
|
46
|
+
REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
|
47
|
+
REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
48
|
+
CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
49
|
+
MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
|
50
|
+
REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
|
51
|
+
MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
|
52
|
+
MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
|
53
|
+
MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
|
54
|
+
MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
|
55
|
+
FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
56
|
+
HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
|
57
|
+
HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
|
58
|
+
LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
|
59
|
+
# The single leading space that marks a preformatted line (see IN_PRE_REGEX).
# Only the marker itself is matched, so any further indentation is kept.
PRE_MARKS_REGEX = Regexp.new('\A ')
|
60
|
+
DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
|
61
|
+
ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')
|
62
|
+
|
63
|
+
CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
|
64
|
+
CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
65
|
+
|
66
|
+
ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
67
|
+
UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')
|
68
|
+
|
69
|
+
REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
70
|
+
REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
|
71
|
+
TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
72
|
+
|
73
|
+
SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
|
74
|
+
DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
|
75
|
+
SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
|
76
|
+
DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
|
77
|
+
CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
|
78
|
+
|
79
|
+
COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
80
|
+
COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
81
|
+
COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
82
|
+
COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
+
COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
84
|
+
|
85
|
+
CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
86
|
+
CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
|
87
|
+
CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
|
88
|
+
CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
|
89
|
+
CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
|
90
|
+
CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
|
91
|
+
CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
|
92
|
+
CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
93
|
+
end
|