wp2txt 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "htmlentities"
4
+
5
module Wp2txt
  ###################################################
  # Constants holding precompiled regular expressions (and a few lookup
  # tables), built once when this module is loaded.
  #
  # Naming convention:
  #   * a trailing number 1 marks the opening tag/markup
  #   * a trailing number 2 marks the closing tag/markup
  #   * no trailing number means the pattern covers both opening and closing
  ###################################################

  HTML_DECODER = HTMLEntities.new

  # Named HTML entities paired with the literal character each one stands for.
  ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
  HTML_HASH = ENTITIES.to_h
  # Single capturing group alternating over every entity key; Regexp.union
  # escapes the keys so an entity containing a regexp metacharacter stays literal.
  HTML_REGEX = Regexp.new("(#{Regexp.union(HTML_HASH.keys).source})")

  # Line-oriented markup detection.
  ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
  ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
  ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
  ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
  ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
  ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
  IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')
  IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
  IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
  IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
  IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
  IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
  IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
  IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
  IN_MATH_REGEX1 = Regexp.new('<math.*?>')
  IN_MATH_REGEX2 = Regexp.new('<\/math>')
  IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
  # Opening tag, arbitrary content, closing tag -- the same shape as the
  # inputbox/source/math patterns, so a table with content on one line matches.
  IN_HTML_TABLE_REGEX = Regexp.new('<table.*?>.*?<\/table>')
  IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
  IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
  IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
  IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
  IN_UNORDERED_REGEX = Regexp.new('^\*')
  IN_ORDERED_REGEX = Regexp.new('^\#')
  IN_PRE_REGEX = Regexp.new('^ ')
  IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
  BLANK_LINE_REGEX = Regexp.new('^\s*$')
  # Group 1 captures the text between the double square brackets.
  REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)

  # Patterns for stripping or rewriting markup.
  REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
  REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
  # Group 1 is a run of two or more apostrophes; \1 requires the same run to close.
  REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
  # Group 1 is "x" for a hexadecimal reference (empty otherwise); group 2 is the digits.
  CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
  MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
  REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
  MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
  MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
  MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
  MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
  FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
  HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
  HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
  LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
  # NOTE(review): `\^` is an escaped caret, so this matches a literal "^ " at
  # the start of the string rather than a leading space -- confirm intent.
  PRE_MARKS_REGEX = Regexp.new('\A\^\ ')
  DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
  ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')

  CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
  # NOTE(review): inside a bracket expression `\b` is the backspace character,
  # not a word boundary, so only `{`, `[`, `|` (and `}`, `]`, `|`) act as
  # practical delimiters here. Left as-is to preserve matching behavior.
  CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)

  ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
  UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')

  REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
  REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
  # Anchored at the start only, so each alternative acts as a prefix test.
  # `lang` is spelled out literally: as a regexp, `lang*` would mean "lan"
  # followed by zero or more "g" and would accept any key starting with "lan".
  TYPE_CODE_REGEX = Regexp.new('\A(?:lang|IPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)

  # Each pattern captures either the opening or the closing delimiter of a pair.
  SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
  DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
  SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
  DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
  CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)

  # Patterns 02-05 use `\g<1>` to re-enter group 1, which lets them match
  # nested [[...]] / {{...}} structures; `++` is a possessive quantifier.
  COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
  COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
  COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
  COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
  COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)

  # Patterns for leftover markup fragments.
  CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
  CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
  CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
  CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
  CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
  CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
  CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
  # Three or more consecutive newlines.
  CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
end