infoboxer 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ci.yml +32 -0
  3. data/.rubocop_todo.yml +0 -15
  4. data/CHANGELOG.md +43 -0
  5. data/Gemfile.lock +172 -0
  6. data/README.md +1 -1
  7. data/infoboxer.gemspec +1 -1
  8. data/lib/infoboxer.rb +23 -11
  9. data/lib/infoboxer/core_ext.rb +1 -1
  10. data/lib/infoboxer/definitions/en.wikipedia.org.rb +3 -1
  11. data/lib/infoboxer/media_wiki.rb +83 -65
  12. data/lib/infoboxer/media_wiki/page.rb +10 -1
  13. data/lib/infoboxer/media_wiki/traits.rb +69 -22
  14. data/lib/infoboxer/navigation.rb +7 -1
  15. data/lib/infoboxer/navigation/lookup.rb +15 -7
  16. data/lib/infoboxer/navigation/sections.rb +27 -9
  17. data/lib/infoboxer/navigation/selector.rb +14 -6
  18. data/lib/infoboxer/navigation/shortcuts.rb +1 -1
  19. data/lib/infoboxer/navigation/wikipath.rb +1 -1
  20. data/lib/infoboxer/parser.rb +2 -2
  21. data/lib/infoboxer/parser/context.rb +23 -9
  22. data/lib/infoboxer/parser/html.rb +1 -1
  23. data/lib/infoboxer/parser/image.rb +2 -2
  24. data/lib/infoboxer/parser/inline.rb +50 -7
  25. data/lib/infoboxer/parser/paragraphs.rb +3 -3
  26. data/lib/infoboxer/parser/table.rb +33 -17
  27. data/lib/infoboxer/parser/template.rb +5 -4
  28. data/lib/infoboxer/parser/util.rb +2 -1
  29. data/lib/infoboxer/templates.rb +2 -0
  30. data/lib/infoboxer/templates/base.rb +2 -0
  31. data/lib/infoboxer/templates/set.rb +1 -1
  32. data/lib/infoboxer/tree.rb +2 -2
  33. data/lib/infoboxer/tree/compound.rb +3 -3
  34. data/lib/infoboxer/tree/document.rb +1 -1
  35. data/lib/infoboxer/tree/gallery.rb +12 -0
  36. data/lib/infoboxer/tree/html.rb +3 -3
  37. data/lib/infoboxer/tree/image.rb +4 -4
  38. data/lib/infoboxer/tree/inline.rb +3 -3
  39. data/lib/infoboxer/tree/linkable.rb +6 -1
  40. data/lib/infoboxer/tree/list.rb +4 -5
  41. data/lib/infoboxer/tree/math.rb +2 -3
  42. data/lib/infoboxer/tree/node.rb +4 -4
  43. data/lib/infoboxer/tree/nodes.rb +51 -7
  44. data/lib/infoboxer/tree/paragraphs.rb +1 -1
  45. data/lib/infoboxer/tree/ref.rb +1 -1
  46. data/lib/infoboxer/tree/table.rb +4 -4
  47. data/lib/infoboxer/tree/template.rb +18 -5
  48. data/lib/infoboxer/tree/text.rb +11 -11
  49. data/lib/infoboxer/tree/wikilink.rb +16 -8
  50. data/lib/infoboxer/version.rb +4 -3
  51. data/lib/infoboxer/wiki_path.rb +12 -1
  52. data/regression/pages/2012_bdo_world_darts_championship.wiki +941 -0
  53. data/regression/pages/progress_wrestling.wiki +1308 -0
  54. metadata +12 -8
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Infoboxer
2
4
  module Tree
3
5
  # Module included into everything, that can be treated as
@@ -15,7 +17,7 @@ module Infoboxer
15
17
  # * {Tree::Nodes#follow} for extracting multiple links at once;
16
18
  # * {MediaWiki#get} for basic information on page extraction.
17
19
  def follow
18
- client.get(link)
20
+ client.get(link, interwiki: interwiki)
19
21
  end
20
22
 
21
23
  # Human-readable page URL
@@ -28,6 +30,9 @@ module Infoboxer
28
30
 
29
31
  protected
30
32
 
33
+ # redefined in {Wikilink}
34
+ def interwiki; end
35
+
31
36
  def page
32
37
  lookup_parents(MediaWiki::Page).first or fail('Not in a page from real source')
33
38
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Infoboxer
4
4
  module Tree
@@ -15,9 +15,8 @@ module Infoboxer
15
15
  # Internal, used by {Parser}
16
16
  def merge!(other)
17
17
  ochildren = other.children.dup
18
- if children.last && children.last.can_merge?(ochildren.first)
19
- children.last.merge!(ochildren.shift)
20
- end
18
+ children.last.merge!(ochildren.shift) \
19
+ if children.last&.can_merge?(ochildren.first)
21
20
  push_children(*ochildren)
22
21
  end
23
22
 
@@ -81,7 +80,7 @@ module Infoboxer
81
80
  # Represents ordered list (list with numbers).
82
81
  class OrderedList < List
83
82
  def make_marker(item)
84
- list_text_indent + "#{(item.index + 1)}. "
83
+ list_text_indent + "#{item.index + 1}. "
85
84
  end
86
85
  end
87
86
 
@@ -1,12 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Infoboxer
2
4
  module Tree
3
5
  # Represents node of math formulae marked with TeX
4
6
  #
5
7
  # See also: https://en.wikipedia.org/wiki/Help:Displaying_a_formula
6
8
  class Math < Text
7
- def text
8
- "<math>#{super}</math>"
9
- end
10
9
  end
11
10
  end
12
11
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require 'htmlentities'
4
4
 
@@ -11,7 +11,7 @@ module Infoboxer
11
11
  # you will receive it from tree and use for navigations.
12
12
  #
13
13
  class Node
14
- def initialize(params = {})
14
+ def initialize(**params)
15
15
  @params = params
16
16
  end
17
17
 
@@ -154,7 +154,7 @@ module Infoboxer
154
154
  end
155
155
 
156
156
  def show_params(prms = nil)
157
- (prms || params).map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
157
+ (prms || params).compact.map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
158
158
  end
159
159
 
160
160
  def indent(level)
@@ -162,7 +162,7 @@ module Infoboxer
162
162
  end
163
163
 
164
164
  def _eq(_other)
165
- fail(NotImplementedError, "#_eq should be defined in subclasses (called for #{self.class})")
165
+ false
166
166
  end
167
167
 
168
168
  def decode(str)
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Infoboxer
4
4
  module Tree
@@ -38,15 +38,32 @@ module Infoboxer
38
38
  # @!method compact
39
39
  # Just like Array#compact, but returns Nodes
40
40
 
41
+ # @!method grep(pattern)
42
+ # Just like Array#grep, but returns Nodes
43
+
44
+ # @!method grep_v(pattern)
45
+ # Just like Array#grep_v, but returns Nodes
46
+
41
47
  # @!method -(other)
42
48
  # Just like Array#-, but returns Nodes
43
49
 
44
- %i[select reject sort_by flatten compact -].each do |sym|
50
+ # @!method +(other)
51
+ # Just like Array#+, but returns Nodes
52
+
53
+ # NB: Since Ruby 3.0, we need to redefine all Enumerable methods (otherwise they return Array).
54
+ # TODO: Check those lacking overrides!
55
+
56
+ %i[
57
+ select reject sort_by flatten compact grep grep_v - +
58
+ take_while drop_while
59
+ ].each do |sym|
45
60
  define_method(sym) do |*args, &block|
46
61
  Nodes[*super(*args, &block)]
47
62
  end
48
63
  end
49
64
 
65
+ alias_method :filter, :select
66
+
50
67
  # Just like Array#first, but returns Nodes, if provided with `n` of elements.
51
68
  def first(n = nil)
52
69
  if n.nil?
@@ -75,6 +92,21 @@ module Infoboxer
75
92
  end
76
93
  end
77
94
 
95
+ # Just like Array#flat_map, but returns Nodes, **if** all map results are Node
96
+ def flat_map
97
+ res = super
98
+ if res.all? { |n| n.is_a?(Node) || n.is_a?(Nodes) }
99
+ Nodes[*res]
100
+ else
101
+ res
102
+ end
103
+ end
104
+
105
+ # Just like Array#group_by, but returns hash with `{<grouping variable> => Nodes}`
106
+ def group_by
107
+ super.transform_values { |group| Nodes[*group] }
108
+ end
109
+
78
110
  # @!method prev_siblings
79
111
  # Previous siblings (flat list) of all nodes inside.
80
112
 
@@ -129,6 +161,12 @@ module Infoboxer
129
161
  map(&:text).join
130
162
  end
131
163
 
164
+ alias_method :to_s, :text
165
+
166
+ def unwrap
167
+ map { |n| n.respond_to?(:unwrap) ? n.unwrap : n }
168
+ end
169
+
132
170
  # Fetches pages by ALL wikilinks inside in ONE query to MediaWiki
133
171
  # API.
134
172
  #
@@ -139,23 +177,27 @@ module Infoboxer
139
177
  # @return [Nodes<MediaWiki::Page>] It is still `Nodes`, so you
140
178
  # can still process them uniformly.
141
179
  def follow
142
- links = select { |n| n.respond_to?(:link) }.map(&:link)
180
+ links = grep(Linkable)
143
181
  return Nodes[] if links.empty?
182
+
144
183
  page = first.lookup_parents(MediaWiki::Page).first or
145
184
  fail('Not in a page from real source')
146
185
  page.client or fail('MediaWiki client not set')
147
- page.client.get(*links)
186
+ pages = links.group_by(&:interwiki)
187
+ .flat_map { |iw, ls| page.client.get(*ls.map(&:link), interwiki: iw) }
188
+ pages.count == 1 ? pages.first : Nodes[*pages]
148
189
  end
149
190
 
150
191
  # @private
151
192
  # Internal, used by {Parser}
152
- def <<(node)
193
+ def <<(node) # rubocop:disable Metrics/PerceivedComplexity
153
194
  if node.is_a?(Array)
154
195
  node.each { |n| self << n }
155
- elsif last && last.can_merge?(node)
196
+ elsif last&.can_merge?(node)
156
197
  last.merge!(node)
157
198
  else
158
199
  return if !node || node.empty?
200
+
159
201
  node = Text.new(node) if node.is_a?(String)
160
202
  super
161
203
  end
@@ -173,7 +215,9 @@ module Infoboxer
173
215
  # @private
174
216
  # Internal, used by {Parser}
175
217
  def flow_templates
176
- make_nodes(map { |n| n.is_a?(Paragraph) ? n.to_templates? : n })
218
+ # TODO: will it be better?..
219
+ # make_nodes(map { |n| n.is_a?(Paragraph) ? n.to_templates? : n })
220
+ self
177
221
  end
178
222
 
179
223
  private
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Infoboxer
4
4
  module Tree
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Infoboxer
4
4
  module Tree
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require 'terminal-table'
4
4
 
@@ -26,13 +26,13 @@ module Infoboxer
26
26
  #
27
27
  # FIXME: it can easily be several table heading rows
28
28
  def heading_row
29
- rows.first if rows.first && rows.first.children.all? { |c| c.is_a?(TableHeading) }
29
+ rows.first if rows.first&.children&.all? { |c| c.is_a?(TableHeading) }
30
30
  end
31
31
 
32
32
  # For now, returns all table rows except {#heading_row}
33
33
  def body_rows
34
- if rows.first && rows.first.children.all? { |c| c.is_a?(TableHeading) }
35
- rows[1..-1]
34
+ if rows.first&.children&.all? { |c| c.is_a?(TableHeading) }
35
+ rows[1..]
36
36
  else
37
37
  rows
38
38
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require_relative 'linkable'
4
4
 
@@ -22,6 +22,10 @@ module Infoboxer
22
22
  false
23
23
  end
24
24
 
25
+ def named?
26
+ name !~ /^\d+$/
27
+ end
28
+
25
29
  protected
26
30
 
27
31
  def descr
@@ -110,12 +114,17 @@ module Infoboxer
110
114
  alias_method :variables, :children
111
115
 
112
116
  def initialize(name, variables = Nodes[])
113
- super(variables, extract_params(variables))
117
+ super(variables, **extract_params(variables))
114
118
  @name = name
115
119
  end
116
120
 
117
121
  def text
118
- ''
122
+ res = unnamed_variables.map(&:text).join('|')
123
+ res.empty? ? '' : "{#{name}:#{res}}"
124
+ end
125
+
126
+ def unwrap
127
+ unnamed_variables.flat_map(&:children).unwrap
119
128
  end
120
129
 
121
130
  # See {Node#to_tree}
@@ -139,7 +148,11 @@ module Infoboxer
139
148
  #
140
149
  # @return [Nodes<Var>]
141
150
  def unnamed_variables
142
- variables.find(name: /^\d+$/)
151
+ variables.reject(&:named?)
152
+ end
153
+
154
+ def named_variables
155
+ variables.select(&:named?)
143
156
  end
144
157
 
145
158
  # Fetches template variable(s) by name(s) or patterns.
@@ -242,7 +255,7 @@ module Infoboxer
242
255
  def extract_params(vars)
243
256
  vars
244
257
  .select { |v| v.children.count == 1 && v.children.first.is_a?(Text) }
245
- .map { |v| [v.name, v.children.first.raw_text] }.to_h
258
+ .map { |v| [v.name.to_sym, v.children.first.raw_text] }.to_h
246
259
  end
247
260
 
248
261
  def inspect_variables(depth)
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Infoboxer
4
4
  module Tree
@@ -15,9 +15,9 @@ module Infoboxer
15
15
  # Text fragment without decoding of HTML entities.
16
16
  attr_accessor :raw_text
17
17
 
18
- def initialize(text, params = {})
19
- super(params)
20
- @raw_text = text
18
+ def initialize(text, **params)
19
+ super(**params)
20
+ @raw_text = +text
21
21
  end
22
22
 
23
23
  # See {Node#text}
@@ -39,13 +39,13 @@ module Infoboxer
39
39
  # @private
40
40
  # Internal, used by {Parser}
41
41
  def merge!(other)
42
- if other.is_a?(String)
43
- @raw_text << other
44
- elsif other.is_a?(Text)
45
- @raw_text << other.raw_text
46
- else
47
- fail("Not mergeable into text: #{other.inspect}")
48
- end
42
+ @raw_text <<
43
+ case other
44
+ when String then other
45
+ when Text then other.raw_text
46
+ else
47
+ fail("Not mergeable into text: #{other.inspect}")
48
+ end
49
49
  end
50
50
 
51
51
  # @private
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require_relative 'linkable'
4
4
 
@@ -12,14 +12,23 @@ module Infoboxer
12
12
  # Note, that Wikilink is {Linkable}, so you can {Linkable#follow #follow}
13
13
  # it to obtain linked pages.
14
14
  class Wikilink < Link
15
- def initialize(*)
16
- super
17
- parse_link!
15
+ def initialize(link, label = nil, namespace: nil, interwiki: nil)
16
+ super(link, label, namespace: namespace, interwiki: interwiki)
17
+ @namespace = namespace || ''
18
+ @interwiki = interwiki
19
+ parse_name!
18
20
  end
19
21
 
20
22
  # "Clean" wikilink name, for ex., `Cities` for `[Category:Cities]`
21
23
  attr_reader :name
22
24
 
25
+ # Interwiki identifier. For example, `[[wikt:Argentina]]`
26
+ # will have `"Argentina"` as its {#name} and `"wikt"` (wiktionary) as an
27
+ # interwiki. TODO: how to use it.
28
+ #
29
+ # See [Wikipedia docs](https://en.wikipedia.org/wiki/Help:Interwiki_linking) for details.
30
+ attr_reader :interwiki
31
+
23
32
  # Wikilink namespace, `Category` for `[Category:Cities]`, empty
24
33
  # string (not `nil`!) for just `[Cities]`
25
34
  attr_reader :namespace
@@ -46,10 +55,8 @@ module Infoboxer
46
55
 
47
56
  private
48
57
 
49
- def parse_link!
50
- @name, @namespace = link.split(':', 2).reverse
51
- @namespace ||= ''
52
-
58
+ def parse_name!
59
+ @name = namespace.empty? ? link : link.sub(/^#{namespace}:/, '')
53
60
  @name, @anchor = @name.split('#', 2)
54
61
  @anchor ||= ''
55
62
 
@@ -68,6 +75,7 @@ module Infoboxer
68
75
 
69
76
  return unless children.count == 1 &&
70
77
  children.first.is_a?(Text) && children.first.raw_text.empty?
78
+
71
79
  children.first.raw_text = @topic
72
80
  end
73
81
  end
@@ -1,8 +1,9 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Infoboxer
4
4
  MAJOR = 0
5
- MINOR = 3
5
+ MINOR = 4
6
6
  PATCH = 0
7
- VERSION = [MAJOR, MINOR, PATCH].join('.')
7
+ PRE = nil
8
+ VERSION = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
8
9
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Infoboxer
2
4
  # @private
3
5
  class WikiPath
@@ -36,7 +38,7 @@ module Infoboxer
36
38
  attrs[attr.to_sym] = process_value(value)
37
39
  end
38
40
  res = op == '//' ? {op: :lookup} : {}
39
- res[:type] = type.gsub(/(?:^|_)([a-z])/, &:upcase).tr('_', '').to_sym unless type.empty?
41
+ res[:type] = process_type(type) unless type.empty?
40
42
  res.merge(attrs) # TODO: raise if empty selector
41
43
  end
42
44
 
@@ -51,6 +53,15 @@ module Infoboxer
51
53
  end
52
54
  end
53
55
 
56
+ def process_type(type)
57
+ type.gsub(/(?:^|_)([a-z])/, &:upcase).tr('_', '').to_sym
58
+ .tap { |t| valid_type?(t) or fail(ParseError, "Unrecognized node type: #{type}") }
59
+ end
60
+
61
+ def valid_type?(t)
62
+ t == :Section || Infoboxer::Tree.const_defined?(t)
63
+ end
64
+
54
65
  def unexpected(scanner, expected)
55
66
  place = scanner.eos? ? 'end of pattern' : scanner.rest.inspect
56
67
  fail ParseError, "Unexpected #{place}, expecting #{expected}"