infoboxer 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yml +32 -0
- data/.rubocop_todo.yml +0 -15
- data/CHANGELOG.md +43 -0
- data/Gemfile.lock +172 -0
- data/README.md +1 -1
- data/infoboxer.gemspec +1 -1
- data/lib/infoboxer.rb +23 -11
- data/lib/infoboxer/core_ext.rb +1 -1
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +3 -1
- data/lib/infoboxer/media_wiki.rb +83 -65
- data/lib/infoboxer/media_wiki/page.rb +10 -1
- data/lib/infoboxer/media_wiki/traits.rb +69 -22
- data/lib/infoboxer/navigation.rb +7 -1
- data/lib/infoboxer/navigation/lookup.rb +15 -7
- data/lib/infoboxer/navigation/sections.rb +27 -9
- data/lib/infoboxer/navigation/selector.rb +14 -6
- data/lib/infoboxer/navigation/shortcuts.rb +1 -1
- data/lib/infoboxer/navigation/wikipath.rb +1 -1
- data/lib/infoboxer/parser.rb +2 -2
- data/lib/infoboxer/parser/context.rb +23 -9
- data/lib/infoboxer/parser/html.rb +1 -1
- data/lib/infoboxer/parser/image.rb +2 -2
- data/lib/infoboxer/parser/inline.rb +50 -7
- data/lib/infoboxer/parser/paragraphs.rb +3 -3
- data/lib/infoboxer/parser/table.rb +33 -17
- data/lib/infoboxer/parser/template.rb +5 -4
- data/lib/infoboxer/parser/util.rb +2 -1
- data/lib/infoboxer/templates.rb +2 -0
- data/lib/infoboxer/templates/base.rb +2 -0
- data/lib/infoboxer/templates/set.rb +1 -1
- data/lib/infoboxer/tree.rb +2 -2
- data/lib/infoboxer/tree/compound.rb +3 -3
- data/lib/infoboxer/tree/document.rb +1 -1
- data/lib/infoboxer/tree/gallery.rb +12 -0
- data/lib/infoboxer/tree/html.rb +3 -3
- data/lib/infoboxer/tree/image.rb +4 -4
- data/lib/infoboxer/tree/inline.rb +3 -3
- data/lib/infoboxer/tree/linkable.rb +6 -1
- data/lib/infoboxer/tree/list.rb +4 -5
- data/lib/infoboxer/tree/math.rb +2 -3
- data/lib/infoboxer/tree/node.rb +4 -4
- data/lib/infoboxer/tree/nodes.rb +51 -7
- data/lib/infoboxer/tree/paragraphs.rb +1 -1
- data/lib/infoboxer/tree/ref.rb +1 -1
- data/lib/infoboxer/tree/table.rb +4 -4
- data/lib/infoboxer/tree/template.rb +18 -5
- data/lib/infoboxer/tree/text.rb +11 -11
- data/lib/infoboxer/tree/wikilink.rb +16 -8
- data/lib/infoboxer/version.rb +4 -3
- data/lib/infoboxer/wiki_path.rb +12 -1
- data/regression/pages/2012_bdo_world_darts_championship.wiki +941 -0
- data/regression/pages/progress_wrestling.wiki +1308 -0
- metadata +12 -8
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Infoboxer
|
2
4
|
module Tree
|
3
5
|
# Module included into everything, that can be treated as
|
@@ -15,7 +17,7 @@ module Infoboxer
|
|
15
17
|
# * {Tree::Nodes#follow} for extracting multiple links at once;
|
16
18
|
# * {MediaWiki#get} for basic information on page extraction.
|
17
19
|
def follow
|
18
|
-
client.get(link)
|
20
|
+
client.get(link, interwiki: interwiki)
|
19
21
|
end
|
20
22
|
|
21
23
|
# Human-readable page URL
|
@@ -28,6 +30,9 @@ module Infoboxer
|
|
28
30
|
|
29
31
|
protected
|
30
32
|
|
33
|
+
# redefined in {Wikilink}
|
34
|
+
def interwiki; end
|
35
|
+
|
31
36
|
def page
|
32
37
|
lookup_parents(MediaWiki::Page).first or fail('Not in a page from real source')
|
33
38
|
end
|
data/lib/infoboxer/tree/list.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Tree
|
@@ -15,9 +15,8 @@ module Infoboxer
|
|
15
15
|
# Internal, used by {Parser}
|
16
16
|
def merge!(other)
|
17
17
|
ochildren = other.children.dup
|
18
|
-
|
19
|
-
children.last
|
20
|
-
end
|
18
|
+
children.last.merge!(ochildren.shift) \
|
19
|
+
if children.last&.can_merge?(ochildren.first)
|
21
20
|
push_children(*ochildren)
|
22
21
|
end
|
23
22
|
|
@@ -81,7 +80,7 @@ module Infoboxer
|
|
81
80
|
# Represents ordered list (list with numbers).
|
82
81
|
class OrderedList < List
|
83
82
|
def make_marker(item)
|
84
|
-
list_text_indent + "#{
|
83
|
+
list_text_indent + "#{item.index + 1}. "
|
85
84
|
end
|
86
85
|
end
|
87
86
|
|
data/lib/infoboxer/tree/math.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Infoboxer
|
2
4
|
module Tree
|
3
5
|
# Represents node of math formulae marked with TeX
|
4
6
|
#
|
5
7
|
# See also: https://en.wikipedia.org/wiki/Help:Displaying_a_formula
|
6
8
|
class Math < Text
|
7
|
-
def text
|
8
|
-
"<math>#{super}</math>"
|
9
|
-
end
|
10
9
|
end
|
11
10
|
end
|
12
11
|
end
|
data/lib/infoboxer/tree/node.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'htmlentities'
|
4
4
|
|
@@ -11,7 +11,7 @@ module Infoboxer
|
|
11
11
|
# you will receive it from tree and use for navigations.
|
12
12
|
#
|
13
13
|
class Node
|
14
|
-
def initialize(params
|
14
|
+
def initialize(**params)
|
15
15
|
@params = params
|
16
16
|
end
|
17
17
|
|
@@ -154,7 +154,7 @@ module Infoboxer
|
|
154
154
|
end
|
155
155
|
|
156
156
|
def show_params(prms = nil)
|
157
|
-
(prms || params).map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
|
157
|
+
(prms || params).compact.map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
|
158
158
|
end
|
159
159
|
|
160
160
|
def indent(level)
|
@@ -162,7 +162,7 @@ module Infoboxer
|
|
162
162
|
end
|
163
163
|
|
164
164
|
def _eq(_other)
|
165
|
-
|
165
|
+
false
|
166
166
|
end
|
167
167
|
|
168
168
|
def decode(str)
|
data/lib/infoboxer/tree/nodes.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Tree
|
@@ -38,15 +38,32 @@ module Infoboxer
|
|
38
38
|
# @!method compact
|
39
39
|
# Just like Array#compact, but returns Nodes
|
40
40
|
|
41
|
+
# @!method grep(pattern)
|
42
|
+
# Just like Array#grep, but returns Nodes
|
43
|
+
|
44
|
+
# @!method grep_v(pattern)
|
45
|
+
# Just like Array#grep_v, but returns Nodes
|
46
|
+
|
41
47
|
# @!method -(other)
|
42
48
|
# Just like Array#-, but returns Nodes
|
43
49
|
|
44
|
-
|
50
|
+
# @!method +(other)
|
51
|
+
# Just like Array#+, but returns Nodes
|
52
|
+
|
53
|
+
# NB: Since Ruby 3.0, we need to redefine all Enumerable methods (otherwise they return Array).
|
54
|
+
# TODO: Check those lacking overrides!
|
55
|
+
|
56
|
+
%i[
|
57
|
+
select reject sort_by flatten compact grep grep_v - +
|
58
|
+
take_while drop_while
|
59
|
+
].each do |sym|
|
45
60
|
define_method(sym) do |*args, &block|
|
46
61
|
Nodes[*super(*args, &block)]
|
47
62
|
end
|
48
63
|
end
|
49
64
|
|
65
|
+
alias_method :filter, :select
|
66
|
+
|
50
67
|
# Just like Array#first, but returns Nodes, if provided with `n` of elements.
|
51
68
|
def first(n = nil)
|
52
69
|
if n.nil?
|
@@ -75,6 +92,21 @@ module Infoboxer
|
|
75
92
|
end
|
76
93
|
end
|
77
94
|
|
95
|
+
# Just like Array#flat_map, but returns Nodes, **if** all map results are Node
|
96
|
+
def flat_map
|
97
|
+
res = super
|
98
|
+
if res.all? { |n| n.is_a?(Node) || n.is_a?(Nodes) }
|
99
|
+
Nodes[*res]
|
100
|
+
else
|
101
|
+
res
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# Just like Array#group_by, but returns hash with `{<grouping variable> => Nodes}`
|
106
|
+
def group_by
|
107
|
+
super.transform_values { |group| Nodes[*group] }
|
108
|
+
end
|
109
|
+
|
78
110
|
# @!method prev_siblings
|
79
111
|
# Previous siblings (flat list) of all nodes inside.
|
80
112
|
|
@@ -129,6 +161,12 @@ module Infoboxer
|
|
129
161
|
map(&:text).join
|
130
162
|
end
|
131
163
|
|
164
|
+
alias_method :to_s, :text
|
165
|
+
|
166
|
+
def unwrap
|
167
|
+
map { |n| n.respond_to?(:unwrap) ? n.unwrap : n }
|
168
|
+
end
|
169
|
+
|
132
170
|
# Fetches pages by ALL wikilinks inside in ONE query to MediaWiki
|
133
171
|
# API.
|
134
172
|
#
|
@@ -139,23 +177,27 @@ module Infoboxer
|
|
139
177
|
# @return [Nodes<MediaWiki::Page>] It is still `Nodes`, so you
|
140
178
|
# still can process them uniformly.
|
141
179
|
def follow
|
142
|
-
links =
|
180
|
+
links = grep(Linkable)
|
143
181
|
return Nodes[] if links.empty?
|
182
|
+
|
144
183
|
page = first.lookup_parents(MediaWiki::Page).first or
|
145
184
|
fail('Not in a page from real source')
|
146
185
|
page.client or fail('MediaWiki client not set')
|
147
|
-
|
186
|
+
pages = links.group_by(&:interwiki)
|
187
|
+
.flat_map { |iw, ls| page.client.get(*ls.map(&:link), interwiki: iw) }
|
188
|
+
pages.count == 1 ? pages.first : Nodes[*pages]
|
148
189
|
end
|
149
190
|
|
150
191
|
# @private
|
151
192
|
# Internal, used by {Parser}
|
152
|
-
def <<(node)
|
193
|
+
def <<(node) # rubocop:disable Metrics/PerceivedComplexity
|
153
194
|
if node.is_a?(Array)
|
154
195
|
node.each { |n| self << n }
|
155
|
-
elsif last
|
196
|
+
elsif last&.can_merge?(node)
|
156
197
|
last.merge!(node)
|
157
198
|
else
|
158
199
|
return if !node || node.empty?
|
200
|
+
|
159
201
|
node = Text.new(node) if node.is_a?(String)
|
160
202
|
super
|
161
203
|
end
|
@@ -173,7 +215,9 @@ module Infoboxer
|
|
173
215
|
# @private
|
174
216
|
# Internal, used by {Parser}
|
175
217
|
def flow_templates
|
176
|
-
|
218
|
+
# TODO: will it be better?..
|
219
|
+
# make_nodes(map { |n| n.is_a?(Paragraph) ? n.to_templates? : n })
|
220
|
+
self
|
177
221
|
end
|
178
222
|
|
179
223
|
private
|
data/lib/infoboxer/tree/ref.rb
CHANGED
data/lib/infoboxer/tree/table.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'terminal-table'
|
4
4
|
|
@@ -26,13 +26,13 @@ module Infoboxer
|
|
26
26
|
#
|
27
27
|
# FIXME: it can easily be several table heading rows
|
28
28
|
def heading_row
|
29
|
-
rows.first if rows.first
|
29
|
+
rows.first if rows.first&.children&.all? { |c| c.is_a?(TableHeading) }
|
30
30
|
end
|
31
31
|
|
32
32
|
# For now, returns all table rows except {#heading_row}
|
33
33
|
def body_rows
|
34
|
-
if rows.first
|
35
|
-
rows[1
|
34
|
+
if rows.first&.children&.all? { |c| c.is_a?(TableHeading) }
|
35
|
+
rows[1..]
|
36
36
|
else
|
37
37
|
rows
|
38
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'linkable'
|
4
4
|
|
@@ -22,6 +22,10 @@ module Infoboxer
|
|
22
22
|
false
|
23
23
|
end
|
24
24
|
|
25
|
+
def named?
|
26
|
+
name !~ /^\d+$/
|
27
|
+
end
|
28
|
+
|
25
29
|
protected
|
26
30
|
|
27
31
|
def descr
|
@@ -110,12 +114,17 @@ module Infoboxer
|
|
110
114
|
alias_method :variables, :children
|
111
115
|
|
112
116
|
def initialize(name, variables = Nodes[])
|
113
|
-
super(variables, extract_params(variables))
|
117
|
+
super(variables, **extract_params(variables))
|
114
118
|
@name = name
|
115
119
|
end
|
116
120
|
|
117
121
|
def text
|
118
|
-
''
|
122
|
+
res = unnamed_variables.map(&:text).join('|')
|
123
|
+
res.empty? ? '' : "{#{name}:#{res}}"
|
124
|
+
end
|
125
|
+
|
126
|
+
def unwrap
|
127
|
+
unnamed_variables.flat_map(&:children).unwrap
|
119
128
|
end
|
120
129
|
|
121
130
|
# See {Node#to_tree}
|
@@ -139,7 +148,11 @@ module Infoboxer
|
|
139
148
|
#
|
140
149
|
# @return [Nodes<Var>]
|
141
150
|
def unnamed_variables
|
142
|
-
variables.
|
151
|
+
variables.reject(&:named?)
|
152
|
+
end
|
153
|
+
|
154
|
+
def named_variables
|
155
|
+
variables.select(&:named?)
|
143
156
|
end
|
144
157
|
|
145
158
|
# Fetches template variable(s) by name(s) or patterns.
|
@@ -242,7 +255,7 @@ module Infoboxer
|
|
242
255
|
def extract_params(vars)
|
243
256
|
vars
|
244
257
|
.select { |v| v.children.count == 1 && v.children.first.is_a?(Text) }
|
245
|
-
.map { |v| [v.name, v.children.first.raw_text] }.to_h
|
258
|
+
.map { |v| [v.name.to_sym, v.children.first.raw_text] }.to_h
|
246
259
|
end
|
247
260
|
|
248
261
|
def inspect_variables(depth)
|
data/lib/infoboxer/tree/text.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Tree
|
@@ -15,9 +15,9 @@ module Infoboxer
|
|
15
15
|
# Text fragment without decoding of HTML entities.
|
16
16
|
attr_accessor :raw_text
|
17
17
|
|
18
|
-
def initialize(text, params
|
19
|
-
super(params)
|
20
|
-
@raw_text = text
|
18
|
+
def initialize(text, **params)
|
19
|
+
super(**params)
|
20
|
+
@raw_text = +text
|
21
21
|
end
|
22
22
|
|
23
23
|
# See {Node#text}
|
@@ -39,13 +39,13 @@ module Infoboxer
|
|
39
39
|
# @private
|
40
40
|
# Internal, used by {Parser}
|
41
41
|
def merge!(other)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
42
|
+
@raw_text <<
|
43
|
+
case other
|
44
|
+
when String then other
|
45
|
+
when Text then other.raw_text
|
46
|
+
else
|
47
|
+
fail("Not mergeable into text: #{other.inspect}")
|
48
|
+
end
|
49
49
|
end
|
50
50
|
|
51
51
|
# @private
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'linkable'
|
4
4
|
|
@@ -12,14 +12,23 @@ module Infoboxer
|
|
12
12
|
# Note, that Wikilink is {Linkable}, so you can {Linkable#follow #follow}
|
13
13
|
# it to obtain linked pages.
|
14
14
|
class Wikilink < Link
|
15
|
-
def initialize(
|
16
|
-
super
|
17
|
-
|
15
|
+
def initialize(link, label = nil, namespace: nil, interwiki: nil)
|
16
|
+
super(link, label, namespace: namespace, interwiki: interwiki)
|
17
|
+
@namespace = namespace || ''
|
18
|
+
@interwiki = interwiki
|
19
|
+
parse_name!
|
18
20
|
end
|
19
21
|
|
20
22
|
# "Clean" wikilink name, for ex., `Cities` for `[Category:Cities]`
|
21
23
|
attr_reader :name
|
22
24
|
|
25
|
+
# Interwiki identifier. For example, `[[wikt:Argentina]]`
|
26
|
+
# will have `"Argentina"` as its {#name} and `"wikt"` (wiktionary) as an
|
27
|
+
# interwiki. TODO: how to use it.
|
28
|
+
#
|
29
|
+
# See [Wikipedia docs](https://en.wikipedia.org/wiki/Help:Interwiki_linking) for details.
|
30
|
+
attr_reader :interwiki
|
31
|
+
|
23
32
|
# Wikilink namespace, `Category` for `[Category:Cities]`, empty
|
24
33
|
# string (not `nil`!) for just `[Cities]`
|
25
34
|
attr_reader :namespace
|
@@ -46,10 +55,8 @@ module Infoboxer
|
|
46
55
|
|
47
56
|
private
|
48
57
|
|
49
|
-
def
|
50
|
-
@name
|
51
|
-
@namespace ||= ''
|
52
|
-
|
58
|
+
def parse_name!
|
59
|
+
@name = namespace.empty? ? link : link.sub(/^#{namespace}:/, '')
|
53
60
|
@name, @anchor = @name.split('#', 2)
|
54
61
|
@anchor ||= ''
|
55
62
|
|
@@ -68,6 +75,7 @@ module Infoboxer
|
|
68
75
|
|
69
76
|
return unless children.count == 1 &&
|
70
77
|
children.first.is_a?(Text) && children.first.raw_text.empty?
|
78
|
+
|
71
79
|
children.first.raw_text = @topic
|
72
80
|
end
|
73
81
|
end
|
data/lib/infoboxer/version.rb
CHANGED
data/lib/infoboxer/wiki_path.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Infoboxer
|
2
4
|
# @private
|
3
5
|
class WikiPath
|
@@ -36,7 +38,7 @@ module Infoboxer
|
|
36
38
|
attrs[attr.to_sym] = process_value(value)
|
37
39
|
end
|
38
40
|
res = op == '//' ? {op: :lookup} : {}
|
39
|
-
res[:type] = type
|
41
|
+
res[:type] = process_type(type) unless type.empty?
|
40
42
|
res.merge(attrs) # TODO: raise if empty selector
|
41
43
|
end
|
42
44
|
|
@@ -51,6 +53,15 @@ module Infoboxer
|
|
51
53
|
end
|
52
54
|
end
|
53
55
|
|
56
|
+
def process_type(type)
|
57
|
+
type.gsub(/(?:^|_)([a-z])/, &:upcase).tr('_', '').to_sym
|
58
|
+
.tap { |t| valid_type?(t) or fail(ParseError, "Unrecognized node type: #{type}") }
|
59
|
+
end
|
60
|
+
|
61
|
+
def valid_type?(t)
|
62
|
+
t == :Section || Infoboxer::Tree.const_defined?(t)
|
63
|
+
end
|
64
|
+
|
54
65
|
def unexpected(scanner, expected)
|
55
66
|
place = scanner.eos? ? 'end of pattern' : scanner.rest.inspect
|
56
67
|
fail ParseError, "Unexpected #{place}, expecting #{expected}"
|