wp2txt 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,62 +1,54 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- $: << File.join(File.dirname(__FILE__))
5
-
1
+ # frozen_string_literal: true
6
2
 
7
3
  require 'strscan'
8
- require 'utils'
4
+ require_relative 'utils'
9
5
 
10
6
  module Wp2txt
11
-
12
7
  # possible element types, each of which can later be selected for printing or left out
13
- # :mw_heading
14
- # :mw_htable
15
- # :mw_quote
16
- # :mw_unordered
17
- # :mw_ordered
18
- # :mw_definition
19
- # :mw_pre
20
- # :mw_paragraph
21
- # :mw_comment
22
- # :mw_math
23
- # :mw_source
24
- # :mw_inputbox
25
- # :mw_template
26
- # :mw_link
27
- # :mw_summary
28
- # :mw_blank
29
- # :mw_redirect
8
+ # :mw_heading
9
+ # :mw_htable
10
+ # :mw_quote
11
+ # :mw_unordered
12
+ # :mw_ordered
13
+ # :mw_definition
14
+ # :mw_pre
15
+ # :mw_paragraph
16
+ # :mw_comment
17
+ # :mw_math
18
+ # :mw_source
19
+ # :mw_inputbox
20
+ # :mw_template
21
+ # :mw_link
22
+ # :mw_summary
23
+ # :mw_blank
24
+ # :mw_redirect
30
25
 
31
26
  # an article contains elements, each of which is [TYPE, string]
32
27
  class Article
33
-
34
28
  include Wp2txt
35
29
  attr_accessor :elements, :title, :categories
36
-
30
+
37
31
  def initialize(text, title = "", strip_tmarker = false)
38
32
  @title = title.strip
39
33
  @strip_tmarker = strip_tmarker
40
- convert_characters!(text)
41
- text.gsub!(/\|\n\n+/m){"|\n"}
42
- remove_html!(text)
43
- make_reference!(text)
44
- remove_ref!(text)
34
+ text = convert_characters(text)
35
+ text = text.gsub(/\|\n\n+/m) { "|\n" }
36
+ text = remove_html(text)
37
+ text = make_reference(text)
38
+ text = remove_ref(text)
45
39
  parse text
46
40
  end
47
-
48
- def create_element(tp, text)
49
- [tp, text]
41
+
42
+ def create_element(tpx, text)
43
+ [tpx, text]
50
44
  end
51
-
45
+
52
46
  def parse(source)
53
47
  @elements = []
54
- @categories = []
48
+ @categories = []
55
49
  mode = nil
56
- open_stack = []
57
- close_stack = []
58
50
  source.each_line do |line|
59
- matched = line.scan($category_regex)
51
+ matched = line.scan(CATEGORY_REGEX)
60
52
  if matched && !matched.empty?
61
53
  @categories += matched
62
54
  @categories.uniq!
@@ -65,108 +57,94 @@ module Wp2txt
65
57
  case mode
66
58
  when :mw_ml_template
67
59
  scanner = StringScanner.new(line)
68
- str= process_nested_structure(scanner, "{{", "}}") {""}
69
- if $ml_template_end_regex =~ str
70
- mode = nil
71
- end
60
+ str = process_nested_structure(scanner, "{{", "}}") { "" }
61
+ mode = nil if ML_TEMPLATE_END_REGEX =~ str
72
62
  @elements.last.last << line
73
63
  next
74
64
  when :mw_ml_link
75
65
  scanner = StringScanner.new(line)
76
- str= process_nested_structure(scanner, "[[", "]]") {""}
77
- if $ml_link_end_regex =~ str
78
- mode = nil
79
- end
66
+ str = process_nested_structure(scanner, "[[", "]]") { "" }
67
+ mode = nil if ML_LINK_END_REGEX =~ str
80
68
  @elements.last.last << line
81
69
  next
82
70
  when :mw_table
83
- if $in_table_regex2 =~ line
84
- mode = nil
85
- end
71
+ mode = nil if IN_TABLE_REGEX2 =~ line
86
72
  @elements.last.last << line
87
- next
73
+ next
88
74
  when :mw_inputbox
89
- if $in_inputbox_regex2 =~ line
90
- mode = nil
91
- end
75
+ mode = nil if IN_INPUTBOX_REGEX2 =~ line
92
76
  @elements.last.last << line
93
77
  next
94
78
  when :mw_source
95
- if $in_source_regex2 =~ line
96
- mode = nil
97
- end
79
+ mode = nil if IN_SOURCE_REGEX2 =~ line
98
80
  @elements.last.last << line
99
81
  next
100
82
  when :mw_math
101
- if $in_math_regex2 =~ line
102
- mode = nil
103
- end
83
+ mode = nil if IN_MATH_REGEX2 =~ line
104
84
  @elements.last.last << line
105
85
  next
106
86
  when :mw_htable
107
- if $in_html_table_regex2 =~ line
108
- mode = nil
109
- end
87
+ mode = nil if IN_HTML_TABLE_REGEX2 =~ line
110
88
  @elements.last.last << line
111
89
  next
112
90
  end
113
91
 
114
92
  case line
115
- when $isolated_template_regex
93
+ when ISOLATED_TEMPLATE_REGEX
116
94
  @elements << create_element(:mw_isolated_template, line)
117
- when $isolated_tag_regex
95
+ when ISOLATED_TAG_REGEX
118
96
  @elements << create_element(:mw_isolated_tag, line)
119
- when $blank_line_regex
120
- @elements << create_element(:mw_blank, "\n")
121
- when $redirect_regex
97
+ when BLANK_LINE_REGEX
98
+ @elements << create_element(:mw_blank, "\n")
99
+ when REDIRECT_REGEX
122
100
  @elements << create_element(:mw_redirect, line)
123
- when $in_heading_regex
124
- line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
101
+ when IN_HEADING_REGEX
102
+ line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
125
103
  @elements << create_element(:mw_heading, "\n" + line + "\n")
126
- when $in_inputbox_regex
104
+ when IN_INPUTBOX_REGEX
127
105
  @elements << create_element(:mw_inputbox, line)
128
- when $ml_template_onset_regex
106
+ when ML_TEMPLATE_ONSET_REGEX
129
107
  @elements << create_element(:mw_ml_template, line)
130
108
  mode = :mw_ml_template
131
- when $ml_link_onset_regex
109
+ when ML_LINK_ONSET_REGEX
132
110
  @elements << create_element(:mw_ml_link, line)
133
111
  mode = :mw_ml_link
134
- when $in_inputbox_regex1
112
+ when IN_INPUTBOX_REGEX1
135
113
  mode = :mw_inputbox
136
114
  @elements << create_element(:mw_inputbox, line)
137
- when $in_source_regex
138
- @elements << create_element(:mw_source, line)
139
- when $in_source_regex1
115
+ when IN_SOURCE_REGEX
116
+ @elements << create_element(:mw_source, line)
117
+ when IN_SOURCE_REGEX1
140
118
  mode = :mw_source
141
119
  @elements << create_element(:mw_source, line)
142
- when $in_math_regex
120
+ when IN_MATH_REGEX
143
121
  @elements << create_element(:mw_math, line)
144
- when $in_math_regex1
122
+ when IN_MATH_REGEX1
145
123
  mode = :mw_math
146
124
  @elements << create_element(:mw_math, line)
147
- when $in_html_table_regex
125
+ when IN_HTML_TABLE_REGEX
148
126
  @elements << create_element(:mw_htable, line)
149
- when $in_html_table_regex1
127
+ when IN_HTML_TABLE_REGEX1
150
128
  mode = :mw_htable
151
129
  @elements << create_element(:mw_htable, line)
152
- when $in_table_regex1
130
+ when IN_TABLE_REGEX1
153
131
  mode = :mw_table
154
132
  @elements << create_element(:mw_table, line)
155
- when $in_unordered_regex
156
- line = line.sub($list_marks_regex, "") if @strip_tmarker
133
+ when IN_UNORDERED_REGEX
134
+ line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
157
135
  @elements << create_element(:mw_unordered, line)
158
- when $in_ordered_regex
159
- line = line.sub($list_marks_regex, "") if @strip_tmarker
136
+ when IN_ORDERED_REGEX
137
+ line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
160
138
  @elements << create_element(:mw_ordered, line)
161
- when $in_pre_regex
162
- line = line.sub($pre_marks_regex, "") if @strip_tmarker
139
+ when IN_PRE_REGEX
140
+ line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
163
141
  @elements << create_element(:mw_pre, line)
164
- when $in_definition_regex
165
- line = line.sub($def_marks_regex, "") if @strip_tmarker
142
+ when IN_DEFINITION_REGEX
143
+ line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
166
144
  @elements << create_element(:mw_definition, line)
167
- when $in_link_regex
145
+ when IN_LINK_REGEX
168
146
  @elements << create_element(:mw_link, line)
169
- else
147
+ else
170
148
  @elements << create_element(:mw_paragraph, "\n" + line)
171
149
  end
172
150
  end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "htmlentities"
4
+
5
+ module Wp2txt
6
+ ###################################################
7
+ # constants holding precompiled regexps, so that they are built only once
8
+ # those with a trailing number 1 represent opening tag/markup
9
+ # those with a trailing number 2 represent closing tag/markup
10
+ # those without a trailing number match the opening and closing tag/markup together
11
+
12
+ HTML_DECODER = HTMLEntities.new
13
+
14
+ ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
15
+ HTML_HASH = Hash[*ENTITIES.flatten]
16
+ HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
17
+ ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
18
+ ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
19
+ ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
20
+ ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
21
+ ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
22
+ ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
23
+ IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')
24
+ IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
25
+ IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
26
+ IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
27
+ IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
28
+ IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
29
+ IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
30
+ IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
31
+ IN_MATH_REGEX1 = Regexp.new('<math.*?>')
32
+ IN_MATH_REGEX2 = Regexp.new('<\/math>')
33
+ IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
34
+ IN_HTML_TABLE_REGEX = Regexp.new("<table.*?><\/table>")
35
+ IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
36
+ IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
37
+ IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
38
+ IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
39
+ IN_UNORDERED_REGEX = Regexp.new('^\*')
40
+ IN_ORDERED_REGEX = Regexp.new('^\#')
41
+ IN_PRE_REGEX = Regexp.new('^ ')
42
+ IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
43
+ BLANK_LINE_REGEX = Regexp.new('^\s*$')
44
+ REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
45
+ REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
46
+ REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
47
+ REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
48
+ CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
49
+ MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
50
+ REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
51
+ MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
52
+ MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
53
+ MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
54
+ MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
55
+ FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
56
+ HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
57
+ HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
58
+ LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
59
+ PRE_MARKS_REGEX = Regexp.new('\A\^\ ')
60
+ DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
61
+ ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')
62
+
63
+ CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
64
+ CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
65
+
66
+ ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
67
+ UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')
68
+
69
+ REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
70
+ REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
71
+ TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
72
+
73
+ SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
74
+ DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
75
+ SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
76
+ DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
77
+ CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
78
+
79
+ COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
80
+ COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
81
+ COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
82
+ COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
83
+ COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
84
+
85
+ CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
86
+ CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
87
+ CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
88
+ CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
89
+ CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
90
+ CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
91
+ CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
92
+ CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
93
+ end