wp2txt 1.0.2 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +34 -6
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +129 -155
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/lib/wp2txt/regex.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "htmlentities"
|
4
|
+
|
5
|
+
module Wp2txt
  ###################################################
  # Lookup tables and precompiled regular expressions, built once when the
  # file is loaded so they are not regenerated on every use.
  #
  # Naming convention:
  # - a trailing number 1 marks the opening tag/markup
  # - a trailing number 2 marks the closing tag/markup
  # - no trailing number means the pattern contains both opening and
  #   closing tags/markup
  ###################################################

  HTML_DECODER = HTMLEntities.new

  # Named HTML entities paired with the literal character each one stands for.
  ENTITIES = ["&nbsp;", "&lt;", "&gt;", "&amp;", "&quot;"].zip([" ", "<", ">", "&", '"']).freeze
  # Entity name => replacement character.
  HTML_HASH = ENTITIES.to_h.freeze
  # One capture group alternating over every key of HTML_HASH.
  HTML_REGEX = Regexp.new("(#{HTML_HASH.keys.join("|")})")

  # A line that opens "{{" / "[[" without closing it, and a line that ends
  # with "}}" / "]]" (optionally followed by whitespace).
  ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
  ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
  ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
  ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')

  # Lines consisting solely of one template, one tag pair, or one bracketed link.
  ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
  ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
  IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')

  IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
  IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
  IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
  IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
  IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
  IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
  IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
  IN_MATH_REGEX1 = Regexp.new('<math.*?>')
  IN_MATH_REGEX2 = Regexp.new('<\/math>')
  IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
  # NOTE(review): unlike the source/math patterns above, nothing is allowed
  # between the opening tag and "</table>" here - confirm this is intended.
  IN_HTML_TABLE_REGEX = Regexp.new("<table.*?><\/table>")
  IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
  IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
  IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
  IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
  IN_UNORDERED_REGEX = Regexp.new('^\*')
  IN_ORDERED_REGEX = Regexp.new('^\#')
  IN_PRE_REGEX = Regexp.new('^ ')
  IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
  BLANK_LINE_REGEX = Regexp.new('^\s*$')
  # Captures the target inside [[...]] after "#REDIRECT" (or its Japanese
  # equivalent), case-insensitively.
  REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
  REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
  REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
  # Group 1 is a run of two or more apostrophes, group 2 the text enclosed
  # by a matching run.
  REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
  # Numeric character reference: group 1 is "x" for hexadecimal (else empty),
  # group 2 holds the digits.
  CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
  MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
  REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
  MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
  MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
  MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
  MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
  FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
  HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
  HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
  LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
  # NOTE(review): this matches a literal caret followed by a space at the
  # start of the string - confirm the escaped "^" is intended.
  PRE_MARKS_REGEX = Regexp.new('\A\^\ ')
  DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
  ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')

  # Alternation of the namespace words recognised as a category prefix.
  CATEGORY_PATTERNS = ["Category", "Categoria"].join("|").freeze
  # NOTE(review): inside a character class "\b" is a backspace character,
  # not a word boundary - confirm that is what the delimiters should be.
  CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)

  ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
  UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')

  REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
  REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
  # NOTE(review): "lang*" matches any string starting with "lan" (the "g" is
  # optional and repeatable), and the inner "\A" before IPA is redundant.
  TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)

  # Each of these captures either the opening or the closing delimiter of a pair.
  SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
  DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
  SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
  DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
  CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)

  # Patterns 02-05 use possessive quantifiers and the recursive
  # subexpression call \g<1> to match nested [[...]] / {{...}} markup.
  COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
  COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
  COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
  COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
  COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)

  CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
  CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
  CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
  CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
  CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
  CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
  CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
  CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
end
|