wp2txt 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,62 +1,54 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
5
-
1
+ # frozen_string_literal: true
6
2
 
7
3
  require 'strscan'
8
- require 'utils'
4
+ require_relative 'utils'
9
5
 
10
6
  module Wp2txt
11
-
12
7
  # possible element types, each of which can later be chosen to be printed or not
13
- # :mw_heading
14
- # :mw_htable
15
- # :mw_quote
16
- # :mw_unordered
17
- # :mw_ordered
18
- # :mw_definition
19
- # :mw_pre
20
- # :mw_paragraph
21
- # :mw_comment
22
- # :mw_math
23
- # :mw_source
24
- # :mw_inputbox
25
- # :mw_template
26
- # :mw_link
27
- # :mw_summary
28
- # :mw_blank
29
- # :mw_redirect
8
+ # :mw_heading
9
+ # :mw_htable
10
+ # :mw_quote
11
+ # :mw_unordered
12
+ # :mw_ordered
13
+ # :mw_definition
14
+ # :mw_pre
15
+ # :mw_paragraph
16
+ # :mw_comment
17
+ # :mw_math
18
+ # :mw_source
19
+ # :mw_inputbox
20
+ # :mw_template
21
+ # :mw_link
22
+ # :mw_summary
23
+ # :mw_blank
24
+ # :mw_redirect
30
25
 
31
26
  # an article contains elements, each of which is [TYPE, string]
32
27
  class Article
33
-
34
28
  include Wp2txt
35
29
  attr_accessor :elements, :title, :categories
36
-
30
+
37
31
  def initialize(text, title = "", strip_tmarker = false)
38
32
  @title = title.strip
39
33
  @strip_tmarker = strip_tmarker
40
- convert_characters!(text)
41
- text.gsub!(/\|\n\n+/m){"|\n"}
42
- remove_html!(text)
43
- make_reference!(text)
44
- remove_ref!(text)
34
+ text = convert_characters(text)
35
+ text = text.gsub(/\|\n\n+/m) { "|\n" }
36
+ text = remove_html(text)
37
+ text = make_reference(text)
38
+ text = remove_ref(text)
45
39
  parse text
46
40
  end
47
-
48
- def create_element(tp, text)
49
- [tp, text]
41
+
42
+ def create_element(tpx, text)
43
+ [tpx, text]
50
44
  end
51
-
45
+
52
46
  def parse(source)
53
47
  @elements = []
54
- @categories = []
48
+ @categories = []
55
49
  mode = nil
56
- open_stack = []
57
- close_stack = []
58
50
  source.each_line do |line|
59
- matched = line.scan($category_regex)
51
+ matched = line.scan(CATEGORY_REGEX)
60
52
  if matched && !matched.empty?
61
53
  @categories += matched
62
54
  @categories.uniq!
@@ -65,108 +57,94 @@ module Wp2txt
65
57
  case mode
66
58
  when :mw_ml_template
67
59
  scanner = StringScanner.new(line)
68
- str= process_nested_structure(scanner, "{{", "}}") {""}
69
- if $ml_template_end_regex =~ str
70
- mode = nil
71
- end
60
+ str = process_nested_structure(scanner, "{{", "}}") { "" }
61
+ mode = nil if ML_TEMPLATE_END_REGEX =~ str
72
62
  @elements.last.last << line
73
63
  next
74
64
  when :mw_ml_link
75
65
  scanner = StringScanner.new(line)
76
- str= process_nested_structure(scanner, "[[", "]]") {""}
77
- if $ml_link_end_regex =~ str
78
- mode = nil
79
- end
66
+ str = process_nested_structure(scanner, "[[", "]]") { "" }
67
+ mode = nil if ML_LINK_END_REGEX =~ str
80
68
  @elements.last.last << line
81
69
  next
82
70
  when :mw_table
83
- if $in_table_regex2 =~ line
84
- mode = nil
85
- end
71
+ mode = nil if IN_TABLE_REGEX2 =~ line
86
72
  @elements.last.last << line
87
- next
73
+ next
88
74
  when :mw_inputbox
89
- if $in_inputbox_regex2 =~ line
90
- mode = nil
91
- end
75
+ mode = nil if IN_INPUTBOX_REGEX2 =~ line
92
76
  @elements.last.last << line
93
77
  next
94
78
  when :mw_source
95
- if $in_source_regex2 =~ line
96
- mode = nil
97
- end
79
+ mode = nil if IN_SOURCE_REGEX2 =~ line
98
80
  @elements.last.last << line
99
81
  next
100
82
  when :mw_math
101
- if $in_math_regex2 =~ line
102
- mode = nil
103
- end
83
+ mode = nil if IN_MATH_REGEX2 =~ line
104
84
  @elements.last.last << line
105
85
  next
106
86
  when :mw_htable
107
- if $in_html_table_regex2 =~ line
108
- mode = nil
109
- end
87
+ mode = nil if IN_HTML_TABLE_REGEX2 =~ line
110
88
  @elements.last.last << line
111
89
  next
112
90
  end
113
91
 
114
92
  case line
115
- when $isolated_template_regex
93
+ when ISOLATED_TEMPLATE_REGEX
116
94
  @elements << create_element(:mw_isolated_template, line)
117
- when $isolated_tag_regex
95
+ when ISOLATED_TAG_REGEX
118
96
  @elements << create_element(:mw_isolated_tag, line)
119
- when $blank_line_regex
120
- @elements << create_element(:mw_blank, "\n")
121
- when $redirect_regex
97
+ when BLANK_LINE_REGEX
98
+ @elements << create_element(:mw_blank, "\n")
99
+ when REDIRECT_REGEX
122
100
  @elements << create_element(:mw_redirect, line)
123
- when $in_heading_regex
124
- line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
101
+ when IN_HEADING_REGEX
102
+ line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
125
103
  @elements << create_element(:mw_heading, "\n" + line + "\n")
126
- when $in_inputbox_regex
104
+ when IN_INPUTBOX_REGEX
127
105
  @elements << create_element(:mw_inputbox, line)
128
- when $ml_template_onset_regex
106
+ when ML_TEMPLATE_ONSET_REGEX
129
107
  @elements << create_element(:mw_ml_template, line)
130
108
  mode = :mw_ml_template
131
- when $ml_link_onset_regex
109
+ when ML_LINK_ONSET_REGEX
132
110
  @elements << create_element(:mw_ml_link, line)
133
111
  mode = :mw_ml_link
134
- when $in_inputbox_regex1
112
+ when IN_INPUTBOX_REGEX1
135
113
  mode = :mw_inputbox
136
114
  @elements << create_element(:mw_inputbox, line)
137
- when $in_source_regex
138
- @elements << create_element(:mw_source, line)
139
- when $in_source_regex1
115
+ when IN_SOURCE_REGEX
116
+ @elements << create_element(:mw_source, line)
117
+ when IN_SOURCE_REGEX1
140
118
  mode = :mw_source
141
119
  @elements << create_element(:mw_source, line)
142
- when $in_math_regex
120
+ when IN_MATH_REGEX
143
121
  @elements << create_element(:mw_math, line)
144
- when $in_math_regex1
122
+ when IN_MATH_REGEX1
145
123
  mode = :mw_math
146
124
  @elements << create_element(:mw_math, line)
147
- when $in_html_table_regex
125
+ when IN_HTML_TABLE_REGEX
148
126
  @elements << create_element(:mw_htable, line)
149
- when $in_html_table_regex1
127
+ when IN_HTML_TABLE_REGEX1
150
128
  mode = :mw_htable
151
129
  @elements << create_element(:mw_htable, line)
152
- when $in_table_regex1
130
+ when IN_TABLE_REGEX1
153
131
  mode = :mw_table
154
132
  @elements << create_element(:mw_table, line)
155
- when $in_unordered_regex
156
- line = line.sub($list_marks_regex, "") if @strip_tmarker
133
+ when IN_UNORDERED_REGEX
134
+ line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
157
135
  @elements << create_element(:mw_unordered, line)
158
- when $in_ordered_regex
159
- line = line.sub($list_marks_regex, "") if @strip_tmarker
136
+ when IN_ORDERED_REGEX
137
+ line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
160
138
  @elements << create_element(:mw_ordered, line)
161
- when $in_pre_regex
162
- line = line.sub($pre_marks_regex, "") if @strip_tmarker
139
+ when IN_PRE_REGEX
140
+ line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
163
141
  @elements << create_element(:mw_pre, line)
164
- when $in_definition_regex
165
- line = line.sub($def_marks_regex, "") if @strip_tmarker
142
+ when IN_DEFINITION_REGEX
143
+ line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
166
144
  @elements << create_element(:mw_definition, line)
167
- when $in_link_regex
145
+ when IN_LINK_REGEX
168
146
  @elements << create_element(:mw_link, line)
169
- else
147
+ else
170
148
  @elements << create_element(:mw_paragraph, "\n" + line)
171
149
  end
172
150
  end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "htmlentities"
4
+
5
module Wp2txt
  ###################################################
  # Shared lookup tables and regular expressions, built once when the file is
  # loaded so that they are not regenerated each time they are needed.
  #
  # Naming convention:
  #   * a trailing 1 marks a pattern for an opening tag/markup
  #   * a trailing 2 marks a pattern for a closing tag/markup
  #   * no trailing number means the pattern covers both the opening and the
  #     closing tag/markup

  # Decoder instance from the htmlentities gem.
  HTML_DECODER = HTMLEntities.new

  # Pairs of [entity, replacement character], the same data as a Hash, and a
  # pattern matching any one of the entities.
  ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
  HTML_HASH = Hash[*ENTITIES.flatten]
  HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")

  # Templates / links that open on one line without closing on it, and the
  # patterns that recognise their closing line.
  ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
  ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
  ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
  ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')

  # Lines consisting solely of a template, a tag pair, or a bracketed link.
  ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
  ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
  IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')

  # Tag-delimited regions: whole element / opening tag (1) / closing tag (2).
  IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
  IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
  IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
  IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
  IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
  IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
  IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
  IN_MATH_REGEX1 = Regexp.new('<math.*?>')
  IN_MATH_REGEX2 = Regexp.new('<\/math>')
  IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
  # NOTE(review): unlike the source/math patterns above there is no `.*?`
  # between the opening and closing tag, so `</table>` must directly follow a
  # `>` for this to match -- confirm whether that is intended.
  IN_HTML_TABLE_REGEX = Regexp.new("<table.*?><\/table>")
  IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
  IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
  IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
  IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')

  # Line-type detection by leading marker.
  IN_UNORDERED_REGEX = Regexp.new('^\*')
  IN_ORDERED_REGEX = Regexp.new('^\#')
  IN_PRE_REGEX = Regexp.new('^ ')
  IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
  BLANK_LINE_REGEX = Regexp.new('^\s*$')
  REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)

  # Markup removal / conversion.
  REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
  REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
  REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
  CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
  MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
  REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
  MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
  MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
  MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
  MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
  FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)

  # Leading / trailing markers stripped from already-classified lines.
  HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
  HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
  LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
  # Matches the single leading space that marks a preformatted line, i.e. the
  # marker IN_PRE_REGEX detects. There must be no literal caret in this
  # pattern: '\A\^\ ' only matches text starting with "^ ", which a line that
  # begins with a space can never do, so the marker would never be stripped.
  PRE_MARKS_REGEX = Regexp.new('\A\ ')
  DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
  ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')

  # Category links; the capture group holds the text after "Category:".
  # NOTE(review): inside a bracket expression Ruby reads `\b` as a backspace
  # character, not a word boundary -- confirm whether a boundary was intended.
  CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
  CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)

  ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
  UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')

  REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
  REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
  # NOTE(review): `lang*` matches "lan" followed by any number of "g", and the
  # second `\A` repeats the anchor already applied to the whole group --
  # confirm the intended template names.
  TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)

  # Opening/closing bracket pairs; each captures whichever delimiter matched.
  SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
  DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
  SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
  DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
  CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)

  # Patterns for nested constructs; `\g<1>` re-enters capture group 1 so that
  # nested [[...]] / {{...}} pairs are matched recursively.
  COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
  COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
  COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
  COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
  COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)

  # Patterns for leftover markup fragments.
  CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
  CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
  CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
  CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
  CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
  CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
  CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
  CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
end