infoboxer 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yml +32 -0
- data/.rubocop_todo.yml +0 -15
- data/CHANGELOG.md +43 -0
- data/Gemfile.lock +172 -0
- data/README.md +1 -1
- data/infoboxer.gemspec +1 -1
- data/lib/infoboxer.rb +23 -11
- data/lib/infoboxer/core_ext.rb +1 -1
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +3 -1
- data/lib/infoboxer/media_wiki.rb +83 -65
- data/lib/infoboxer/media_wiki/page.rb +10 -1
- data/lib/infoboxer/media_wiki/traits.rb +69 -22
- data/lib/infoboxer/navigation.rb +7 -1
- data/lib/infoboxer/navigation/lookup.rb +15 -7
- data/lib/infoboxer/navigation/sections.rb +27 -9
- data/lib/infoboxer/navigation/selector.rb +14 -6
- data/lib/infoboxer/navigation/shortcuts.rb +1 -1
- data/lib/infoboxer/navigation/wikipath.rb +1 -1
- data/lib/infoboxer/parser.rb +2 -2
- data/lib/infoboxer/parser/context.rb +23 -9
- data/lib/infoboxer/parser/html.rb +1 -1
- data/lib/infoboxer/parser/image.rb +2 -2
- data/lib/infoboxer/parser/inline.rb +50 -7
- data/lib/infoboxer/parser/paragraphs.rb +3 -3
- data/lib/infoboxer/parser/table.rb +33 -17
- data/lib/infoboxer/parser/template.rb +5 -4
- data/lib/infoboxer/parser/util.rb +2 -1
- data/lib/infoboxer/templates.rb +2 -0
- data/lib/infoboxer/templates/base.rb +2 -0
- data/lib/infoboxer/templates/set.rb +1 -1
- data/lib/infoboxer/tree.rb +2 -2
- data/lib/infoboxer/tree/compound.rb +3 -3
- data/lib/infoboxer/tree/document.rb +1 -1
- data/lib/infoboxer/tree/gallery.rb +12 -0
- data/lib/infoboxer/tree/html.rb +3 -3
- data/lib/infoboxer/tree/image.rb +4 -4
- data/lib/infoboxer/tree/inline.rb +3 -3
- data/lib/infoboxer/tree/linkable.rb +6 -1
- data/lib/infoboxer/tree/list.rb +4 -5
- data/lib/infoboxer/tree/math.rb +2 -3
- data/lib/infoboxer/tree/node.rb +4 -4
- data/lib/infoboxer/tree/nodes.rb +51 -7
- data/lib/infoboxer/tree/paragraphs.rb +1 -1
- data/lib/infoboxer/tree/ref.rb +1 -1
- data/lib/infoboxer/tree/table.rb +4 -4
- data/lib/infoboxer/tree/template.rb +18 -5
- data/lib/infoboxer/tree/text.rb +11 -11
- data/lib/infoboxer/tree/wikilink.rb +16 -8
- data/lib/infoboxer/version.rb +4 -3
- data/lib/infoboxer/wiki_path.rb +12 -1
- data/regression/pages/2012_bdo_world_darts_championship.wiki +941 -0
- data/regression/pages/progress_wrestling.wiki +1308 -0
- metadata +12 -8
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Infoboxer
|
2
4
|
module Tree
|
3
5
|
# Module included into everything, that can be treated as
|
@@ -15,7 +17,7 @@ module Infoboxer
|
|
15
17
|
# * {Tree::Nodes#follow} for extracting multiple links at once;
|
16
18
|
# * {MediaWiki#get} for basic information on page extraction.
|
17
19
|
def follow
|
18
|
-
client.get(link)
|
20
|
+
client.get(link, interwiki: interwiki)
|
19
21
|
end
|
20
22
|
|
21
23
|
# Human-readable page URL
|
@@ -28,6 +30,9 @@ module Infoboxer
|
|
28
30
|
|
29
31
|
protected
|
30
32
|
|
33
|
+
# redefined in {Wikilink}
|
34
|
+
def interwiki; end
|
35
|
+
|
31
36
|
def page
|
32
37
|
lookup_parents(MediaWiki::Page).first or fail('Not in a page from real source')
|
33
38
|
end
|
data/lib/infoboxer/tree/list.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Tree
|
@@ -15,9 +15,8 @@ module Infoboxer
|
|
15
15
|
# Internal, used by {Parser}
|
16
16
|
def merge!(other)
|
17
17
|
ochildren = other.children.dup
|
18
|
-
|
19
|
-
children.last
|
20
|
-
end
|
18
|
+
children.last.merge!(ochildren.shift) \
|
19
|
+
if children.last&.can_merge?(ochildren.first)
|
21
20
|
push_children(*ochildren)
|
22
21
|
end
|
23
22
|
|
@@ -81,7 +80,7 @@ module Infoboxer
|
|
81
80
|
# Represents ordered list (list with numbers).
|
82
81
|
class OrderedList < List
|
83
82
|
def make_marker(item)
|
84
|
-
list_text_indent + "#{
|
83
|
+
list_text_indent + "#{item.index + 1}. "
|
85
84
|
end
|
86
85
|
end
|
87
86
|
|
data/lib/infoboxer/tree/math.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Infoboxer
|
2
4
|
module Tree
|
3
5
|
# Represents node of math formulae marked with TeX
|
4
6
|
#
|
5
7
|
# See also: https://en.wikipedia.org/wiki/Help:Displaying_a_formula
|
6
8
|
class Math < Text
|
7
|
-
def text
|
8
|
-
"<math>#{super}</math>"
|
9
|
-
end
|
10
9
|
end
|
11
10
|
end
|
12
11
|
end
|
data/lib/infoboxer/tree/node.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'htmlentities'
|
4
4
|
|
@@ -11,7 +11,7 @@ module Infoboxer
|
|
11
11
|
# you will receive it from tree and use for navigations.
|
12
12
|
#
|
13
13
|
class Node
|
14
|
-
def initialize(params
|
14
|
+
def initialize(**params)
|
15
15
|
@params = params
|
16
16
|
end
|
17
17
|
|
@@ -154,7 +154,7 @@ module Infoboxer
|
|
154
154
|
end
|
155
155
|
|
156
156
|
def show_params(prms = nil)
|
157
|
-
(prms || params).map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
|
157
|
+
(prms || params).compact.map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
|
158
158
|
end
|
159
159
|
|
160
160
|
def indent(level)
|
@@ -162,7 +162,7 @@ module Infoboxer
|
|
162
162
|
end
|
163
163
|
|
164
164
|
def _eq(_other)
|
165
|
-
|
165
|
+
false
|
166
166
|
end
|
167
167
|
|
168
168
|
def decode(str)
|
data/lib/infoboxer/tree/nodes.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Tree
|
@@ -38,15 +38,32 @@ module Infoboxer
|
|
38
38
|
# @!method compact
|
39
39
|
# Just like Array#compact, but returns Nodes
|
40
40
|
|
41
|
+
# @!method grep(pattern)
|
42
|
+
# Just like Array#grep, but returns Nodes
|
43
|
+
|
44
|
+
# @!method grep_v(pattern)
|
45
|
+
# Just like Array#grep_v, but returns Nodes
|
46
|
+
|
41
47
|
# @!method -(other)
|
42
48
|
# Just like Array#-, but returns Nodes
|
43
49
|
|
44
|
-
|
50
|
+
# @!method +(other)
|
51
|
+
# Just like Array#+, but returns Nodes
|
52
|
+
|
53
|
+
# NB: Since Ruby 3.0, we need to redefine all Enumerable methods (otherwise they return Array).
|
54
|
+
# TODO: Check those lacking overrides!
|
55
|
+
|
56
|
+
%i[
|
57
|
+
select reject sort_by flatten compact grep grep_v - +
|
58
|
+
take_while drop_while
|
59
|
+
].each do |sym|
|
45
60
|
define_method(sym) do |*args, &block|
|
46
61
|
Nodes[*super(*args, &block)]
|
47
62
|
end
|
48
63
|
end
|
49
64
|
|
65
|
+
alias_method :filter, :select
|
66
|
+
|
50
67
|
# Just like Array#first, but returns Nodes, if provided with `n` of elements.
|
51
68
|
def first(n = nil)
|
52
69
|
if n.nil?
|
@@ -75,6 +92,21 @@ module Infoboxer
|
|
75
92
|
end
|
76
93
|
end
|
77
94
|
|
95
|
+
# Just like Array#flat_map, but returns Nodes, **if** all map results are Node
|
96
|
+
def flat_map
|
97
|
+
res = super
|
98
|
+
if res.all? { |n| n.is_a?(Node) || n.is_a?(Nodes) }
|
99
|
+
Nodes[*res]
|
100
|
+
else
|
101
|
+
res
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# Just like Array#group_by, but returns hash with `{<grouping variable> => Nodes}`
|
106
|
+
def group_by
|
107
|
+
super.transform_values { |group| Nodes[*group] }
|
108
|
+
end
|
109
|
+
|
78
110
|
# @!method prev_siblings
|
79
111
|
# Previous siblings (flat list) of all nodes inside.
|
80
112
|
|
@@ -129,6 +161,12 @@ module Infoboxer
|
|
129
161
|
map(&:text).join
|
130
162
|
end
|
131
163
|
|
164
|
+
alias_method :to_s, :text
|
165
|
+
|
166
|
+
def unwrap
|
167
|
+
map { |n| n.respond_to?(:unwrap) ? n.unwrap : n }
|
168
|
+
end
|
169
|
+
|
132
170
|
# Fetches pages by ALL wikilinks inside in ONE query to MediaWiki
|
133
171
|
# API.
|
134
172
|
#
|
@@ -139,23 +177,27 @@ module Infoboxer
|
|
139
177
|
# @return [Nodes<MediaWiki::Page>] It is still `Nodes`, so you
|
140
178
|
# still can process them uniformly.
|
141
179
|
def follow
|
142
|
-
links =
|
180
|
+
links = grep(Linkable)
|
143
181
|
return Nodes[] if links.empty?
|
182
|
+
|
144
183
|
page = first.lookup_parents(MediaWiki::Page).first or
|
145
184
|
fail('Not in a page from real source')
|
146
185
|
page.client or fail('MediaWiki client not set')
|
147
|
-
|
186
|
+
pages = links.group_by(&:interwiki)
|
187
|
+
.flat_map { |iw, ls| page.client.get(*ls.map(&:link), interwiki: iw) }
|
188
|
+
pages.count == 1 ? pages.first : Nodes[*pages]
|
148
189
|
end
|
149
190
|
|
150
191
|
# @private
|
151
192
|
# Internal, used by {Parser}
|
152
|
-
def <<(node)
|
193
|
+
def <<(node) # rubocop:disable Metrics/PerceivedComplexity
|
153
194
|
if node.is_a?(Array)
|
154
195
|
node.each { |n| self << n }
|
155
|
-
elsif last
|
196
|
+
elsif last&.can_merge?(node)
|
156
197
|
last.merge!(node)
|
157
198
|
else
|
158
199
|
return if !node || node.empty?
|
200
|
+
|
159
201
|
node = Text.new(node) if node.is_a?(String)
|
160
202
|
super
|
161
203
|
end
|
@@ -173,7 +215,9 @@ module Infoboxer
|
|
173
215
|
# @private
|
174
216
|
# Internal, used by {Parser}
|
175
217
|
def flow_templates
|
176
|
-
|
218
|
+
# TODO: will it be better?..
|
219
|
+
# make_nodes(map { |n| n.is_a?(Paragraph) ? n.to_templates? : n })
|
220
|
+
self
|
177
221
|
end
|
178
222
|
|
179
223
|
private
|
data/lib/infoboxer/tree/ref.rb
CHANGED
data/lib/infoboxer/tree/table.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'terminal-table'
|
4
4
|
|
@@ -26,13 +26,13 @@ module Infoboxer
|
|
26
26
|
#
|
27
27
|
# FIXME: it can easily be several table heading rows
|
28
28
|
def heading_row
|
29
|
-
rows.first if rows.first
|
29
|
+
rows.first if rows.first&.children&.all? { |c| c.is_a?(TableHeading) }
|
30
30
|
end
|
31
31
|
|
32
32
|
# For now, returns all table rows except {#heading_row}
|
33
33
|
def body_rows
|
34
|
-
if rows.first
|
35
|
-
rows[1
|
34
|
+
if rows.first&.children&.all? { |c| c.is_a?(TableHeading) }
|
35
|
+
rows[1..]
|
36
36
|
else
|
37
37
|
rows
|
38
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'linkable'
|
4
4
|
|
@@ -22,6 +22,10 @@ module Infoboxer
|
|
22
22
|
false
|
23
23
|
end
|
24
24
|
|
25
|
+
def named?
|
26
|
+
name !~ /^\d+$/
|
27
|
+
end
|
28
|
+
|
25
29
|
protected
|
26
30
|
|
27
31
|
def descr
|
@@ -110,12 +114,17 @@ module Infoboxer
|
|
110
114
|
alias_method :variables, :children
|
111
115
|
|
112
116
|
def initialize(name, variables = Nodes[])
|
113
|
-
super(variables, extract_params(variables))
|
117
|
+
super(variables, **extract_params(variables))
|
114
118
|
@name = name
|
115
119
|
end
|
116
120
|
|
117
121
|
def text
|
118
|
-
''
|
122
|
+
res = unnamed_variables.map(&:text).join('|')
|
123
|
+
res.empty? ? '' : "{#{name}:#{res}}"
|
124
|
+
end
|
125
|
+
|
126
|
+
def unwrap
|
127
|
+
unnamed_variables.flat_map(&:children).unwrap
|
119
128
|
end
|
120
129
|
|
121
130
|
# See {Node#to_tree}
|
@@ -139,7 +148,11 @@ module Infoboxer
|
|
139
148
|
#
|
140
149
|
# @return [Nodes<Var>]
|
141
150
|
def unnamed_variables
|
142
|
-
variables.
|
151
|
+
variables.reject(&:named?)
|
152
|
+
end
|
153
|
+
|
154
|
+
def named_variables
|
155
|
+
variables.select(&:named?)
|
143
156
|
end
|
144
157
|
|
145
158
|
# Fetches template variable(s) by name(s) or patterns.
|
@@ -242,7 +255,7 @@ module Infoboxer
|
|
242
255
|
def extract_params(vars)
|
243
256
|
vars
|
244
257
|
.select { |v| v.children.count == 1 && v.children.first.is_a?(Text) }
|
245
|
-
.map { |v| [v.name, v.children.first.raw_text] }.to_h
|
258
|
+
.map { |v| [v.name.to_sym, v.children.first.raw_text] }.to_h
|
246
259
|
end
|
247
260
|
|
248
261
|
def inspect_variables(depth)
|
data/lib/infoboxer/tree/text.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Tree
|
@@ -15,9 +15,9 @@ module Infoboxer
|
|
15
15
|
# Text fragment without decoding of HTML entities.
|
16
16
|
attr_accessor :raw_text
|
17
17
|
|
18
|
-
def initialize(text, params
|
19
|
-
super(params)
|
20
|
-
@raw_text = text
|
18
|
+
def initialize(text, **params)
|
19
|
+
super(**params)
|
20
|
+
@raw_text = +text
|
21
21
|
end
|
22
22
|
|
23
23
|
# See {Node#text}
|
@@ -39,13 +39,13 @@ module Infoboxer
|
|
39
39
|
# @private
|
40
40
|
# Internal, used by {Parser}
|
41
41
|
def merge!(other)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
42
|
+
@raw_text <<
|
43
|
+
case other
|
44
|
+
when String then other
|
45
|
+
when Text then other.raw_text
|
46
|
+
else
|
47
|
+
fail("Not mergeable into text: #{other.inspect}")
|
48
|
+
end
|
49
49
|
end
|
50
50
|
|
51
51
|
# @private
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'linkable'
|
4
4
|
|
@@ -12,14 +12,23 @@ module Infoboxer
|
|
12
12
|
# Note, that Wikilink is {Linkable}, so you can {Linkable#follow #follow}
|
13
13
|
# it to obtain linked pages.
|
14
14
|
class Wikilink < Link
|
15
|
-
def initialize(
|
16
|
-
super
|
17
|
-
|
15
|
+
def initialize(link, label = nil, namespace: nil, interwiki: nil)
|
16
|
+
super(link, label, namespace: namespace, interwiki: interwiki)
|
17
|
+
@namespace = namespace || ''
|
18
|
+
@interwiki = interwiki
|
19
|
+
parse_name!
|
18
20
|
end
|
19
21
|
|
20
22
|
# "Clean" wikilink name, for ex., `Cities` for `[Category:Cities]`
|
21
23
|
attr_reader :name
|
22
24
|
|
25
|
+
# Interwiki identifier. For example, `[[wikt:Argentina]]`
|
26
|
+
# will have `"Argentina"` as its {#name} and `"wikt"` (wiktionary) as an
|
27
|
+
# interwiki. TODO: how to use it.
|
28
|
+
#
|
29
|
+
# See [Wikipedia docs](https://en.wikipedia.org/wiki/Help:Interwiki_linking) for details.
|
30
|
+
attr_reader :interwiki
|
31
|
+
|
23
32
|
# Wikilink namespace, `Category` for `[Category:Cities]`, empty
|
24
33
|
# string (not `nil`!) for just `[Cities]`
|
25
34
|
attr_reader :namespace
|
@@ -46,10 +55,8 @@ module Infoboxer
|
|
46
55
|
|
47
56
|
private
|
48
57
|
|
49
|
-
def
|
50
|
-
@name
|
51
|
-
@namespace ||= ''
|
52
|
-
|
58
|
+
def parse_name!
|
59
|
+
@name = namespace.empty? ? link : link.sub(/^#{namespace}:/, '')
|
53
60
|
@name, @anchor = @name.split('#', 2)
|
54
61
|
@anchor ||= ''
|
55
62
|
|
@@ -68,6 +75,7 @@ module Infoboxer
|
|
68
75
|
|
69
76
|
return unless children.count == 1 &&
|
70
77
|
children.first.is_a?(Text) && children.first.raw_text.empty?
|
78
|
+
|
71
79
|
children.first.raw_text = @topic
|
72
80
|
end
|
73
81
|
end
|
data/lib/infoboxer/version.rb
CHANGED
data/lib/infoboxer/wiki_path.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Infoboxer
|
2
4
|
# @private
|
3
5
|
class WikiPath
|
@@ -36,7 +38,7 @@ module Infoboxer
|
|
36
38
|
attrs[attr.to_sym] = process_value(value)
|
37
39
|
end
|
38
40
|
res = op == '//' ? {op: :lookup} : {}
|
39
|
-
res[:type] = type
|
41
|
+
res[:type] = process_type(type) unless type.empty?
|
40
42
|
res.merge(attrs) # TODO: raise if empty selector
|
41
43
|
end
|
42
44
|
|
@@ -51,6 +53,15 @@ module Infoboxer
|
|
51
53
|
end
|
52
54
|
end
|
53
55
|
|
56
|
+
def process_type(type)
|
57
|
+
type.gsub(/(?:^|_)([a-z])/, &:upcase).tr('_', '').to_sym
|
58
|
+
.tap { |t| valid_type?(t) or fail(ParseError, "Unrecognized node type: #{type}") }
|
59
|
+
end
|
60
|
+
|
61
|
+
def valid_type?(t)
|
62
|
+
t == :Section || Infoboxer::Tree.const_defined?(t)
|
63
|
+
end
|
64
|
+
|
54
65
|
def unexpected(scanner, expected)
|
55
66
|
place = scanner.eos? ? 'end of pattern' : scanner.rest.inspect
|
56
67
|
fail ParseError, "Unexpected #{place}, expecting #{expected}"
|