infoboxer 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yml +32 -0
- data/.rubocop_todo.yml +0 -15
- data/CHANGELOG.md +43 -0
- data/Gemfile.lock +172 -0
- data/README.md +1 -1
- data/infoboxer.gemspec +1 -1
- data/lib/infoboxer.rb +23 -11
- data/lib/infoboxer/core_ext.rb +1 -1
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +3 -1
- data/lib/infoboxer/media_wiki.rb +83 -65
- data/lib/infoboxer/media_wiki/page.rb +10 -1
- data/lib/infoboxer/media_wiki/traits.rb +69 -22
- data/lib/infoboxer/navigation.rb +7 -1
- data/lib/infoboxer/navigation/lookup.rb +15 -7
- data/lib/infoboxer/navigation/sections.rb +27 -9
- data/lib/infoboxer/navigation/selector.rb +14 -6
- data/lib/infoboxer/navigation/shortcuts.rb +1 -1
- data/lib/infoboxer/navigation/wikipath.rb +1 -1
- data/lib/infoboxer/parser.rb +2 -2
- data/lib/infoboxer/parser/context.rb +23 -9
- data/lib/infoboxer/parser/html.rb +1 -1
- data/lib/infoboxer/parser/image.rb +2 -2
- data/lib/infoboxer/parser/inline.rb +50 -7
- data/lib/infoboxer/parser/paragraphs.rb +3 -3
- data/lib/infoboxer/parser/table.rb +33 -17
- data/lib/infoboxer/parser/template.rb +5 -4
- data/lib/infoboxer/parser/util.rb +2 -1
- data/lib/infoboxer/templates.rb +2 -0
- data/lib/infoboxer/templates/base.rb +2 -0
- data/lib/infoboxer/templates/set.rb +1 -1
- data/lib/infoboxer/tree.rb +2 -2
- data/lib/infoboxer/tree/compound.rb +3 -3
- data/lib/infoboxer/tree/document.rb +1 -1
- data/lib/infoboxer/tree/gallery.rb +12 -0
- data/lib/infoboxer/tree/html.rb +3 -3
- data/lib/infoboxer/tree/image.rb +4 -4
- data/lib/infoboxer/tree/inline.rb +3 -3
- data/lib/infoboxer/tree/linkable.rb +6 -1
- data/lib/infoboxer/tree/list.rb +4 -5
- data/lib/infoboxer/tree/math.rb +2 -3
- data/lib/infoboxer/tree/node.rb +4 -4
- data/lib/infoboxer/tree/nodes.rb +51 -7
- data/lib/infoboxer/tree/paragraphs.rb +1 -1
- data/lib/infoboxer/tree/ref.rb +1 -1
- data/lib/infoboxer/tree/table.rb +4 -4
- data/lib/infoboxer/tree/template.rb +18 -5
- data/lib/infoboxer/tree/text.rb +11 -11
- data/lib/infoboxer/tree/wikilink.rb +16 -8
- data/lib/infoboxer/version.rb +4 -3
- data/lib/infoboxer/wiki_path.rb +12 -1
- data/regression/pages/2012_bdo_world_darts_championship.wiki +941 -0
- data/regression/pages/progress_wrestling.wiki +1308 -0
- metadata +12 -8
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
class MediaWiki
|
@@ -35,6 +35,15 @@ module Infoboxer
|
|
35
35
|
client.traits
|
36
36
|
end
|
37
37
|
|
38
|
+
# FIXME: take from siteinfo!
|
39
|
+
def namespace
|
40
|
+
Traits::STANDARD_NAMESPACES[source.fetch('ns') + 2] # Media = -2, Special = -1, Main = 0
|
41
|
+
end
|
42
|
+
|
43
|
+
def category?
|
44
|
+
namespace == 'Category'
|
45
|
+
end
|
46
|
+
|
38
47
|
private
|
39
48
|
|
40
49
|
PARAMS_TO_INSPECT = %i[url title].freeze
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
class MediaWiki
|
@@ -34,9 +34,8 @@ module Infoboxer
|
|
34
34
|
end
|
35
35
|
|
36
36
|
# @private
|
37
|
-
def get(domain,
|
38
|
-
|
39
|
-
cls ? cls.new(options) : Traits.new(options)
|
37
|
+
def get(domain, site_info = {})
|
38
|
+
(Traits.domains[domain] || Traits).new(site_info)
|
40
39
|
end
|
41
40
|
|
42
41
|
# @private
|
@@ -60,7 +59,7 @@ module Infoboxer
|
|
60
59
|
# [English Wikipedia traits](https://github.com/molybdenum-99/infoboxer/blob/master/lib/infoboxer/definitions/en.wikipedia.org.rb)
|
61
60
|
# for example implementation.
|
62
61
|
def for(domain, &block)
  # `tap` keeps the looked-up class as the value of the left-hand side. With a bare
  # `&.instance_eval(&block)` the value would be the block's own result, so a block
  # whose last expression is nil/false would fall through to the `||` branch and
  # build a second class for an already-known domain.
  Traits.domains[domain]&.tap { |known| known.instance_eval(&block) } ||
    Class.new(self, &block).domain(domain)
end
|
66
65
|
|
@@ -68,18 +67,27 @@ module Infoboxer
|
|
68
67
|
alias_method :default, :new
|
69
68
|
end
|
70
69
|
|
71
|
-
def initialize(
|
72
|
-
@
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
70
|
+
def initialize(site_info = {})
|
71
|
+
@site_info = site_info
|
72
|
+
end
|
73
|
+
|
74
|
+
def namespace?(prefix)
|
75
|
+
known_namespaces.include?(prefix)
|
76
|
+
end
|
77
|
+
|
78
|
+
def interwiki?(prefix)
|
79
|
+
known_interwikis.key?(prefix)
|
79
80
|
end
|
80
81
|
|
81
82
|
# @private
|
82
|
-
|
83
|
+
def file_namespace
|
84
|
+
@file_namespace ||= ns_aliases('File')
|
85
|
+
end
|
86
|
+
|
87
|
+
# @private
|
88
|
+
def category_namespace
|
89
|
+
@category_namespace ||= ns_aliases('Category')
|
90
|
+
end
|
83
91
|
|
84
92
|
# @private
|
85
93
|
def templates
|
@@ -88,16 +96,55 @@ module Infoboxer
|
|
88
96
|
|
89
97
|
private
|
90
98
|
|
91
|
-
def
|
92
|
-
|
93
|
-
|
94
|
-
|
99
|
+
def known_namespaces
|
100
|
+
@known_namespaces ||=
|
101
|
+
if @site_info.empty?
|
102
|
+
STANDARD_NAMESPACES
|
103
|
+
else
|
104
|
+
(@site_info['namespaces'].values + @site_info['namespacealiases']).map { |n| n['*'] }
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
def known_interwikis
|
109
|
+
@known_interwikis ||=
|
110
|
+
if @site_info.empty?
|
111
|
+
{}
|
112
|
+
else
|
113
|
+
@site_info['interwikimap'].map { |iw| [iw['prefix'], iw] }.to_h
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# Lists every name under which the namespace with canonical name +base+ is known.
#
# Reads +@site_info['namespaces']+ (a hash whose values carry 'canonical', '*' and 'id')
# and +@site_info['namespacealiases']+ (an array of hashes carrying 'id' and '*').
#
# @param base [String] canonical namespace name, e.g. 'File'
# @return [Array<String>] +base+, the local name and all aliases, without nils or duplicates;
#   just +[base]+ when there is no site info or no namespace has that canonical name
def ns_aliases(base)
  return [base] if @site_info.empty?

  main = @site_info['namespaces'].values.detect { |ns| ns['canonical'] == base }
  # Unknown canonical name: nothing to look aliases up by.
  return [base] unless main

  aliases = @site_info['namespacealiases']
            .select { |ns_alias| ns_alias['id'] == main['id'] }
            .flat_map { |ns_alias| ns_alias['*'] }

  # compact/uniq must cover the whole list: method calls bind tighter than `+`, so
  # chaining them after the alias lookup alone leaves `base` and the local name
  # duplicated whenever they coincide.
  [base, main['*'], *aliases].compact.uniq
end
|
96
126
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
127
|
+
# See https://www.mediawiki.org/wiki/Help:Namespaces#Standard_namespaces
|
128
|
+
STANDARD_NAMESPACES = [
|
129
|
+
'Media', # Direct linking to media files.
|
130
|
+
'Special', # Special (non-editable) pages.
|
131
|
+
'', # (Main)
|
132
|
+
'Talk', # Article discussion.
|
133
|
+
'User', #
|
134
|
+
'User talk', #
|
135
|
+
'Project', # Meta-discussions related to the operation and development of the wiki.
|
136
|
+
'Project talk', #
|
137
|
+
'File', # Metadata for images, videos, sound files and other media.
|
138
|
+
'File talk', #
|
139
|
+
'MediaWiki', # System messages and other important content.
|
140
|
+
'MediaWiki talk', #
|
141
|
+
'Template', # Templates: blocks of text or wikicode that are intended to be transcluded.
|
142
|
+
'Template talk', #
|
143
|
+
'Help', # Help files, instructions and "how-to" guides.
|
144
|
+
'Help talk', #
|
145
|
+
'Category', # Categories: dynamic lists of other pages.
|
146
|
+
'Category talk', #
|
147
|
+
].freeze
|
101
148
|
end
|
102
149
|
end
|
103
150
|
end
|
data/lib/infoboxer/navigation.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
# Navigation is one of the things Infoboxer is proud about. It tries
|
@@ -96,5 +96,11 @@ module Infoboxer
|
|
96
96
|
class Tree::Document
|
97
97
|
include Navigation::Sections::Container
|
98
98
|
end
|
99
|
+
|
100
|
+
module Helpers
|
101
|
+
def W(*arg, &block) # rubocop:disable Naming/MethodName
|
102
|
+
Lookup::Selector.new(*arg, &block)
|
103
|
+
end
|
104
|
+
end
|
99
105
|
end
|
100
106
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'selector'
|
4
4
|
|
@@ -98,9 +98,13 @@ module Infoboxer
|
|
98
98
|
# Selects matching nodes from current node's siblings, which
|
99
99
|
# are above current node in parents children list.
|
100
100
|
|
101
|
+
# @!method lookup_prev_sibling(*selectors, &block)
|
102
|
+
# Selects first matching nodes from current node's siblings, which
|
103
|
+
# are above current node in parents children list.
|
104
|
+
|
101
105
|
# Underscored version of {#matches?}
|
102
106
|
def _matches?(selector)
|
103
|
-
selector
|
107
|
+
selector === self
|
104
108
|
end
|
105
109
|
|
106
110
|
# Underscored version of {#lookup}
|
@@ -136,6 +140,11 @@ module Infoboxer
|
|
136
140
|
prev_siblings._find(selector)
|
137
141
|
end
|
138
142
|
|
143
|
+
# Underscored version of {#lookup_prev_sibling}
|
144
|
+
def _lookup_prev_sibling(selector)
|
145
|
+
prev_siblings.reverse.detect { |n| selector === n }
|
146
|
+
end
|
147
|
+
|
139
148
|
# Underscored version of {#lookup_next_siblings}
|
140
149
|
def _lookup_next_siblings(selector)
|
141
150
|
next_siblings._find(selector)
|
@@ -146,14 +155,14 @@ module Infoboxer
|
|
146
155
|
lookup lookup_children lookup_parents
|
147
156
|
lookup_siblings
|
148
157
|
lookup_next_siblings lookup_prev_siblings
|
158
|
+
lookup_prev_sibling
|
149
159
|
]
|
150
160
|
.map { |sym| [sym, :"_#{sym}"] }
|
151
161
|
.each do |sym, underscored|
|
152
|
-
|
153
|
-
|
154
|
-
|
162
|
+
define_method(sym) do |*args, &block|
|
163
|
+
send(underscored, Selector.new(*args, &block))
|
164
|
+
end
|
155
165
|
end
|
156
|
-
end
|
157
166
|
|
158
167
|
# Checks if node has any parent matching selectors.
|
159
168
|
def parent?(*selectors, &block)
|
@@ -209,7 +218,6 @@ module Infoboxer
|
|
209
218
|
lookup_siblings
|
210
219
|
lookup_next_siblings lookup_prev_siblings
|
211
220
|
].map { |sym| [sym, :"_#{sym}"] }.each do |sym, underscored|
|
212
|
-
|
213
221
|
define_method(sym) do |*args, &block|
|
214
222
|
send(underscored, Selector.new(*args, &block))
|
215
223
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Navigation
|
@@ -77,10 +77,23 @@ module Infoboxer
|
|
77
77
|
when 1
|
78
78
|
@sections.select { |s| names.first === s.heading.text_ }
|
79
79
|
else
|
80
|
-
@sections.select { |s| names.first === s.heading.text_ }.sections(*names[1
|
80
|
+
@sections.select { |s| names.first === s.heading.text_ }.sections(*names[1..])
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
84
|
+
def subsections(*names)
|
85
|
+
sections = names.map { |name|
|
86
|
+
heading = lookup_children(:Heading, text_: name).first
|
87
|
+
next unless heading
|
88
|
+
|
89
|
+
body = heading.next_siblings
|
90
|
+
.take_while { |n| !n.is_a?(Tree::Heading) || n.level > heading.level }
|
91
|
+
|
92
|
+
Section.new(heading, body)
|
93
|
+
}.compact
|
94
|
+
Tree::Nodes.new(sections)
|
95
|
+
end
|
96
|
+
|
84
97
|
def lookup_children(*arg)
|
85
98
|
if arg.include?(:Section)
|
86
99
|
sections.find(*(arg - [:Section]))
|
@@ -94,6 +107,7 @@ module Infoboxer
|
|
94
107
|
def make_sections
|
95
108
|
res = Tree::Nodes[]
|
96
109
|
return res if headings.empty?
|
110
|
+
|
97
111
|
level = headings.first.level
|
98
112
|
|
99
113
|
children
|
@@ -123,21 +137,25 @@ module Infoboxer
|
|
123
137
|
#
|
124
138
|
# @return {Tree::Nodes<Section>}
|
125
139
|
def in_sections
|
126
|
-
|
140
|
+
return parent.in_sections unless parent.is_a?(Tree::Document)
|
141
|
+
return @in_sections if @in_sections
|
127
142
|
|
128
143
|
heading =
|
129
|
-
if
|
130
|
-
|
144
|
+
if is_a?(Tree::Heading)
|
145
|
+
lookup_prev_sibling(Tree::Heading, level: level - 1)
|
131
146
|
else
|
132
|
-
|
147
|
+
lookup_prev_sibling(Tree::Heading)
|
133
148
|
end
|
134
|
-
|
149
|
+
unless heading
|
150
|
+
@in_sections = Tree::Nodes[]
|
151
|
+
return @in_sections
|
152
|
+
end
|
135
153
|
|
136
154
|
body = heading.next_siblings
|
137
|
-
.take_while { |n| !n.is_a?(Tree::Heading) || n.level
|
155
|
+
.take_while { |n| !n.is_a?(Tree::Heading) || n.level > heading.level }
|
138
156
|
|
139
157
|
section = Section.new(heading, body)
|
140
|
-
Tree::Nodes[section, *heading.in_sections]
|
158
|
+
@in_sections = Tree::Nodes[section, *heading.in_sections]
|
141
159
|
end
|
142
160
|
end
|
143
161
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Infoboxer
|
4
4
|
module Navigation
|
@@ -10,7 +10,7 @@ module Infoboxer
|
|
10
10
|
def initialize(*arg, &block)
|
11
11
|
@arg = [arg, block].flatten.compact.map(&method(:sym_to_class))
|
12
12
|
@arg.each do |a|
|
13
|
-
a.
|
13
|
+
a.compact! if a.is_a?(Hash)
|
14
14
|
end
|
15
15
|
end
|
16
16
|
|
@@ -24,8 +24,8 @@ module Infoboxer
|
|
24
24
|
"#<Selector(#{@arg.map(&:to_s).join(', ')})>"
|
25
25
|
end
|
26
26
|
|
27
|
-
def
|
28
|
-
@arg.all? { |a| arg_matches?(a,
|
27
|
+
def ===(other)
|
28
|
+
@arg.all? { |a| arg_matches?(a, other) }
|
29
29
|
end
|
30
30
|
|
31
31
|
private
|
@@ -44,8 +44,8 @@ module Infoboxer
|
|
44
44
|
check.call(node)
|
45
45
|
when Hash
|
46
46
|
check.all? { |attr, value|
|
47
|
-
node.respond_to?(attr) && value
|
48
|
-
node.params.key?(attr) && value
|
47
|
+
node.respond_to?(attr) && value_matches?(value, node.send(attr)) ||
|
48
|
+
node.params.key?(attr) && value_matches?(value, node.params[attr])
|
49
49
|
}
|
50
50
|
when Symbol
|
51
51
|
node.respond_to?(check) && node.send(check)
|
@@ -53,6 +53,14 @@ module Infoboxer
|
|
53
53
|
check === node
|
54
54
|
end
|
55
55
|
end
|
56
|
+
|
57
|
+
# Compares a selector value with a node attribute/param value.
#
# Two strings are compared case-insensitively; anything else is matched with +===+
# (so a Regexp, Range, Class or Proc matcher works the same way as in +case+/+when+).
#
# @param matcher expected value or pattern taken from the selector hash
# @param value actual value read from the node
# @return [Boolean]
def value_matches?(matcher, value)
  if matcher.is_a?(String) && value.is_a?(String)
    # casecmp? folds Unicode case (casecmp handles ASCII only) and yields nil rather
    # than raising when the two encodings are incompatible.
    matcher.casecmp?(value) || false
  else
    matcher === value
  end
end
|
56
64
|
end
|
57
65
|
end
|
58
66
|
end
|
data/lib/infoboxer/parser.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'ostruct'
|
4
4
|
require 'logger'
|
@@ -54,7 +54,7 @@ module Infoboxer
|
|
54
54
|
def initialize(context)
|
55
55
|
@context = context
|
56
56
|
@re = OpenStruct.new(make_regexps)
|
57
|
-
@logger = Logger.new(
|
57
|
+
@logger = Logger.new($stdout).tap { |l| l.level = Logger::FATAL }
|
58
58
|
end
|
59
59
|
|
60
60
|
require_relative 'parser/inline'
|
@@ -1,16 +1,16 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'strscan'
|
4
4
|
|
5
5
|
module Infoboxer
|
6
6
|
class Parser
|
7
|
-
class Context
|
7
|
+
class Context # rubocop:disable Metrics/ClassLength
|
8
8
|
attr_reader :lineno
|
9
9
|
attr_reader :traits
|
10
10
|
|
11
11
|
def initialize(text, traits = nil)
|
12
12
|
@lines = text
|
13
|
-
.gsub(
|
13
|
+
.gsub(/<!--.*?-->/m, '') # FIXME: will also kill comments inside <nowiki> tag
|
14
14
|
.split(/[\r\n]/)
|
15
15
|
@lineno = -1
|
16
16
|
@traits = traits || MediaWiki::Traits.default
|
@@ -21,22 +21,23 @@ module Infoboxer
|
|
21
21
|
attr_reader :next_lines
|
22
22
|
|
23
23
|
def colno
|
24
|
-
@scanner
|
24
|
+
@scanner&.pos || 0
|
25
25
|
end
|
26
26
|
|
27
27
|
def matched
|
28
|
-
@matched ||= @scanner
|
28
|
+
@matched ||= @scanner&.matched
|
29
29
|
end
|
30
30
|
|
31
31
|
# check which works only once
|
32
32
|
def eat_matched?(str)
|
33
33
|
return false unless matched == str
|
34
|
+
|
34
35
|
@matched = 'DUMMY'
|
35
36
|
true
|
36
37
|
end
|
37
38
|
|
38
39
|
def rest
|
39
|
-
@rest ||= @scanner
|
40
|
+
@rest ||= @scanner&.rest
|
40
41
|
end
|
41
42
|
|
42
43
|
alias_method :current, :rest
|
@@ -109,7 +110,7 @@ module Infoboxer
|
|
109
110
|
end
|
110
111
|
|
111
112
|
def scan_continued_until(re, leave_pattern = false)
|
112
|
-
res = ''
|
113
|
+
res = +''
|
113
114
|
|
114
115
|
loop do
|
115
116
|
chunk = _scan_until(re)
|
@@ -130,7 +131,13 @@ module Infoboxer
|
|
130
131
|
|
131
132
|
# state inspection
|
132
133
|
# Checks whether the last matched text satisfies the given inline terminator.
#
# * +re+ is nil: truthy when the matched text is empty and +eol?+ holds.
# * +re+ starts with +^+: the match must also have begun at offset 0 of the scanner's
#   string, i.e. the scanner position equals the size of the matched text.
# * otherwise: the matched text only has to match +re+.
#
# @param re [Regexp, nil]
# @return truthy/falsy value (result of +=~+ or of the boolean checks)
def matched_inline?(re)
  if re.nil?
    matched.empty? && eol?
  elsif re.source.start_with?('^') # was it REALLY at the beginning of the line?..
    # StringScanner#pos is a byte offset, so compare it with the byte size of the
    # match; a character count differs as soon as the match contains multibyte text.
    @scanner.pos == matched.bytesize && matched =~ re
  else
    matched =~ re
  end
end
|
135
142
|
|
136
143
|
def matched?(re)
|
@@ -146,6 +153,13 @@ module Infoboxer
|
|
146
153
|
fail(ParsingError, "#{text} at line #{@lineno}:\n\t#{current}")
|
147
154
|
end
|
148
155
|
|
156
|
+
# Moves the scanner back over the text stored in +@matched+ and drops the memoized
# +@rest+ so it is re-read from the scanner. Does nothing when +@matched+ is unset.
def unscan_matched!
  return unless @matched

  # StringScanner#pos counts bytes, so rewind by bytes; rewinding by characters
  # under-shoots whenever the matched text contains multibyte characters.
  @scanner.pos -= @matched.bytesize
  @rest = nil
end
|
162
|
+
|
149
163
|
private
|
150
164
|
|
151
165
|
# we do hard use of #matched and #rest, its wiser to memoize them
|
@@ -163,7 +177,7 @@ module Infoboxer
|
|
163
177
|
def shift(amount)
|
164
178
|
@lineno += amount
|
165
179
|
current = @lines[lineno]
|
166
|
-
@next_lines = @lines[(lineno + 1)
|
180
|
+
@next_lines = @lines[(lineno + 1)..]
|
167
181
|
if current
|
168
182
|
@scanner.string = current
|
169
183
|
@rest = current
|