infoboxer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dokaz +1 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/Parsing.md +33 -0
- data/README.md +115 -0
- data/examples/output/.gitkeep +0 -0
- data/examples/pages/argentina.wiki +808 -0
- data/examples/to_text.rb +8 -0
- data/examples/tree.rb +8 -0
- data/infoboxer.gemspec +43 -0
- data/lib/infoboxer.rb +196 -0
- data/lib/infoboxer/core_ext.rb +10 -0
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
- data/lib/infoboxer/media_wiki.rb +162 -0
- data/lib/infoboxer/media_wiki/page.rb +38 -0
- data/lib/infoboxer/media_wiki/traits.rb +60 -0
- data/lib/infoboxer/navigation.rb +84 -0
- data/lib/infoboxer/navigation/lookup.rb +216 -0
- data/lib/infoboxer/navigation/sections.rb +179 -0
- data/lib/infoboxer/navigation/selector.rb +59 -0
- data/lib/infoboxer/navigation/shortcuts.rb +165 -0
- data/lib/infoboxer/parser.rb +71 -0
- data/lib/infoboxer/parser/context.rb +165 -0
- data/lib/infoboxer/parser/html.rb +58 -0
- data/lib/infoboxer/parser/image.rb +59 -0
- data/lib/infoboxer/parser/inline.rb +142 -0
- data/lib/infoboxer/parser/paragraphs.rb +66 -0
- data/lib/infoboxer/parser/table.rb +132 -0
- data/lib/infoboxer/parser/template.rb +47 -0
- data/lib/infoboxer/parser/util.rb +73 -0
- data/lib/infoboxer/templates.rb +10 -0
- data/lib/infoboxer/templates/base.rb +82 -0
- data/lib/infoboxer/templates/set.rb +72 -0
- data/lib/infoboxer/tree.rb +70 -0
- data/lib/infoboxer/tree/compound.rb +81 -0
- data/lib/infoboxer/tree/document.rb +11 -0
- data/lib/infoboxer/tree/html.rb +76 -0
- data/lib/infoboxer/tree/image.rb +53 -0
- data/lib/infoboxer/tree/inline.rb +39 -0
- data/lib/infoboxer/tree/list.rb +160 -0
- data/lib/infoboxer/tree/node.rb +181 -0
- data/lib/infoboxer/tree/nodes.rb +185 -0
- data/lib/infoboxer/tree/paragraphs.rb +122 -0
- data/lib/infoboxer/tree/ref.rb +34 -0
- data/lib/infoboxer/tree/table.rb +89 -0
- data/lib/infoboxer/tree/template.rb +82 -0
- data/lib/infoboxer/tree/text.rb +60 -0
- data/lib/infoboxer/tree/wikilink.rb +83 -0
- data/lib/infoboxer/version.rb +4 -0
- data/profile/out/.gitkeep +0 -0
- data/profile/pages/argentina.txt +808 -0
- data/profile/pages/canada.wiki +544 -0
- data/profile/pages/ukraine.wiki +1006 -0
- data/profile/pages/usa.wiki +843 -0
- data/regression/pages/canada.wiki +544 -0
- data/regression/pages/chiang_mai.wiki +2615 -0
- data/regression/pages/south_america.wiki +640 -0
- data/regression/pages/ukraine.wiki +1006 -0
- data/regression/pages/usa.wiki +843 -0
- metadata +272 -0
@@ -0,0 +1,132 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# http://en.wikipedia.org/wiki/Help:Table
|
5
|
+
module Table
|
6
|
+
include Tree
|
7
|
+
|
8
|
+
# Parses a whole wiki table; the current line must hold the `{|` opener.
#
# Reads the opener's attributes, then hands every following line to
# table_next_line until that method reports the end of the table.
#
# @return [Tree::Table] the table node, with childless children removed
def table
  unless @context.current =~ /^\s*{\|/
    @context.fail!('Something went wrong: trying to parse not a table')
  end

  params = table_params
  node = Tree::Table.new(Nodes[], params)

  @context.next!

  loop do
    break unless table_next_line(node)
    @context.next!
  end

  # FIXME: not the most elegant way, huh?
  # Children that collected no children of their own are dropped.
  node.children.reject! { |child| child.children.empty? }

  node
end
|
27
|
+
|
28
|
+
# Skips the `{|` opener and parses the rest of the current line as
# table attributes.
#
# @return [Hash] whatever parse_params builds from the remaining text
def table_params
  @context.skip(/\s*{\|/)
  remainder = @context.rest
  parse_params(remainder)
end
|
32
|
+
|
33
|
+
# Dispatches the current line to the matching table sub-parser.
#
# Order of the branches matters: `|}`, `|+` and `|-` must be tested
# before the generic "line starts with |" cell branch.
#
# @param table [Tree::Table] table being filled
# @return [Boolean] false once the closing `|}` was consumed, true when
#   parsing of the table should continue
def table_next_line(table)
  line = @context.current

  case line
  when /^\s*\|}(.*)$/ # table end
    @context.scan(/^\s*\|}/)
    return false # should not continue
  when /^\s*!/ then table_cells(table, TableHeading) # heading (th) in a row
  when /^\s*\|\+/ then table_caption(table) # caption
  when /^\s*\|-(.*)$/ then table_row(table, Regexp.last_match(1)) # row start
  when /^\s*\|/ then table_cells(table) # cell in row
  when /^\s*{{/ then table_template(table) # template can be at row level
  when nil then @context.fail!("End of input before table ended!")
  else table_cell_cont(table)
  end

  true # should continue parsing
end
|
62
|
+
|
63
|
+
# Starts a new, empty row in the table.
#
# @param table [Tree::Table] table receiving the row
# @param param_str [String, nil] text following `|-`, parsed as row attributes
def table_row(table, param_str)
  row_params = parse_params(param_str)
  new_row = TableRow.new(Nodes[], row_params)
  table.push_children(new_row)
end
|
66
|
+
|
67
|
+
# Parses a `|+` caption line and appends the caption to the table.
#
# @param table [Tree::Table] table receiving the caption
def table_caption(table)
  @context.skip(/^\s*\|\+\s*/)

  caption_nodes = inline(/^\s*([|!]|{\|)/)

  # table() calls next! after every processed line; step back once so
  # that call is compensated.
  @context.prev!

  caption = TableCaption.new(caption_nodes.strip)
  table.push_children(caption)
end
|
74
|
+
|
75
|
+
# Parses all cells found on the current line and appends them to the
# table's last row; a row is created first when the table's last child
# is not a row.
#
# @param table [Tree::Table] table being filled
# @param cell_class [Class] node class instantiated for every cell
def table_cells(table, cell_class = TableCell)
  unless table.children.last.is_a?(TableRow)
    table.push_children(TableRow.new)
  end
  current_row = table.children.last

  @context.skip(/\s*[!|]\s*/)
  guarded_loop do
    # Text before a single "|" is treated as the cell's attributes.
    cell_params =
      if @context.check(/[^|{|\[]+\|([^\|]|$)/)
        parse_params(@context.scan_until(/\|/))
      else
        {}
      end

    # "||" and "!!" separate cells sharing one line.
    cell_content = short_inline(/(\|\||!!)/)
    current_row.push_children(cell_class.new(cell_content, cell_params))

    break if @context.eol?
  end
end
|
91
|
+
|
92
|
+
# Handles a line starting with a template inside a table.
#
# The parsed content goes to the innermost open container: the last cell
# of the last row, else the last row itself, else the table.
#
# @param table [Tree::Table] table being filled
def table_template(table)
  contents = paragraph(/^\s*([|!]|{\|)/).to_templates?

  last_row = table.children.last
  target =
    if last_row.is_a?(TableRow)
      last_cell = last_row.children.last
      last_cell.is_a?(BaseCell) ? last_cell : last_row
    else
      table
    end

  target.push_children(*contents)
end
|
105
|
+
|
106
|
+
# Continues a multi-line cell or caption with one more paragraph.
#
# A TableCaption is created on the fly to handle the (real life) case
# of a table with an "HTML caption":
#   {|
#   <caption>....</caption>
#
# The solution is NOT elegant or semantically "right", yet it works.
# Somehow.
#
# @param table [Tree::Table] table being filled
def table_cell_cont(table)
  last = table.children.last

  container =
    case last
    when TableRow
      last_cell = last.children.last
      last_cell.is_a?(BaseCell) ? last_cell : TableCaption.new
    when TableCaption then last
    when nil then TableCaption.new
    else @context.fail!("Multiline cell inside #{last}")
    end

  content = paragraph(/^\s*([|!]|{\|)/)
  container.push_children(content)

  # A caption created above is not attached to anything yet.
  table.push_children(container) unless container.parent
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Template
|
5
|
+
include Tree
|
6
|
+
|
7
|
+
# Parses one template, starting right after its opening braces.
#
# NB: templates like {{Infobox|variable}} and "magic words" like
# {{formatnum:123}} are not distinguished here; all of them are simply
# called "templates". This behaviour will probably change in future.
# More about magic words: https://www.mediawiki.org/wiki/Help:Magic_words
#
# @return [Object] instance of the class that the traits' template set
#   returns for the name, built from the name and the parsed variables
def template
  name = @context.scan_continued_until(/\||:|}}/)
  @context.fail!("Template name not found") unless name

  name.strip!

  vars =
    if @context.eat_matched?('}}')
      Nodes[]
    else
      template_vars
    end

  template_class = @context.traits.templates.find(name)
  template_class.new(name, vars)
end
|
20
|
+
|
21
|
+
# Parses template variables up to and including the closing `}}`.
#
# Variables written as `name = value` keep their name; all others are
# named by their position among the *unnamed* variables ("1", "2", ...).
# Named variables do not consume a position, so in `{{t|a=1|x}}` the
# value `x` is stored as "1".
#
# @return [Nodes] Var nodes in source order
def template_vars
  position = 1
  vars = Nodes[]

  guarded_loop do
    if @context.check(/\s*([^ =}|]+)\s*=\s*/)
      name = @context.scan(/\s*([^ =]+)/).strip
      @context.skip(/\s*=\s*/)
    else
      name = position
      # Only unnamed variables advance the positional counter.
      position += 1
    end

    value = long_inline(/\||}}/)
    # An empty unnamed value is just an empty slot/line and is not stored.
    unless value.empty? && name.is_a?(Numeric)
      vars << Var.new(name.to_s, value)
    end

    break if @context.eat_matched?('}}')
    @context.eof? and @context.fail!("Unexpected break of template variables: #{vars}")
  end

  vars
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Util
|
5
|
+
attr_reader :re
|
6
|
+
|
7
|
+
# Alternation of the markers that open inline markup. make_regexps
# exposes it as :formatting and adds it to both "inline until" unions.
FORMATTING = /(
|
8
|
+
'{2,5} | # bold, italic
|
9
|
+
\[\[ | # link
|
10
|
+
{{ | # template
|
11
|
+
\[[a-z]+:\/\/ | # external link
|
12
|
+
<nowiki[^>]*> | # nowiki
|
13
|
+
<ref[^>]*> | # reference
|
14
|
+
< # HTML tag
|
15
|
+
)/x
|
16
|
+
|
17
|
+
# Zero-width lookahead for a closing </ref> or }}. make_regexps adds it
# to the unions stored in :short_inline_until_cache.
INLINE_EOL = %r[(?= # if we have ahead... (not scanned, just checked)
|
18
|
+
</ref> | # <ref> closed
|
19
|
+
}} # or template closed
|
20
|
+
)]x
|
21
|
+
|
22
|
+
|
23
|
+
# Builds the regexps used by the parser.
#
# The two *_cache entries are hashes that lazily build and memoize, per
# requested terminator regexp, its union with FORMATTING and end-of-line
# (plus INLINE_EOL for the "short" variant). A nil terminator is dropped
# from the union by `compact`.
#
# @return [Hash] with keys :file_prefix, :formatting,
#   :inline_until_cache and :short_inline_until_cache
def make_regexps
  prefixes = @context.traits.file_prefix.join('|')

  inline_until = Hash.new do |cache, terminator|
    cache[terminator] = Regexp.union(*[terminator, FORMATTING, /$/].compact.uniq)
  end

  short_inline_until = Hash.new do |cache, terminator|
    cache[terminator] = Regexp.union(*[terminator, INLINE_EOL, FORMATTING, /$/].compact.uniq)
  end

  {
    file_prefix: /(#{prefixes}):/,
    formatting: FORMATTING,
    inline_until_cache: inline_until,
    short_inline_until_cache: short_inline_until
  }
end
|
35
|
+
|
36
|
+
# Parses an HTML-like attribute string (`class="a b" width=50 nowrap`)
# into a hash.
#
# * `name="value"` / `name='value'` - value is the text between quotes;
#   when the closing quote is missing, everything up to the end of the
#   string is taken instead of raising.
# * `name=value` - value runs up to the next whitespace (exclusive).
# * bare `name` - stored with the name itself as the value.
#
# @param str [String, nil] raw attribute text
# @return [Hash{Symbol => String}] parsed attributes; empty for nil input
def parse_params(str)
  return {} unless str

  scanner = StringScanner.new(str)
  params = {}

  loop do
    scanner.skip(/\s*/)
    name = scanner.scan(/[^ \t=]+/)
    break unless name

    scanner.skip(/\s*/)
    unless scanner.skip(/=\s*/)
      # Attribute without "=": flag-style, value equals the name.
      params[name.to_sym] = name
      next
    end

    quote = scanner.scan(/['"]/)
    params[name.to_sym] =
      if quote
        quoted = scanner.scan_until(/#{quote}/)
        # scan_until returns nil when no closing quote exists.
        quoted ? quoted.chomp(quote) : scanner.scan(/.*/m)
      else
        # Stop *before* the separating whitespace so it does not leak
        # into the value.
        scanner.scan(/\S*/)
      end
  end

  params
end
|
60
|
+
|
61
|
+
# Runs the given block repeatedly, like `loop`, but reports a failure
# through the context when an iteration leaves the parsing position
# (line number and column) unchanged.
#
# @yield one parsing step; use `break` inside the block to leave the loop
def guarded_loop
  loop do
    started_at = [@context.lineno, @context.colno]
    yield
    ended_at = [@context.lineno, @context.colno]

    next unless ended_at == started_at

    @context.fail!("Infinite loop on position #{ended_at.last}")
  end
end
|
70
|
+
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Infoboxer
|
2
|
+
module Templates
|
3
|
+
# Base class for template nodes; classes generated for named templates
# carry the template name and definition options at class level.
class Base < Tree::Template
  include Tree

  class << self
    attr_accessor :template_name, :template_options

    # Shows the template name for named classes, default inspect otherwise.
    def inspect
      return super unless template_name

      "Infoboxer::Templates::#{clean_name}"
    end

    # @return [String] "Template[name]" for named classes, else "Template"
    def clean_name
      if template_name
        "Template[#{template_name}]"
      else
        'Template'
      end
    end
  end

  # @return variables whose name consists of digits only
  def unnamed_variables
    variables.select { |var| var.name =~ /^\d+$/ }
  end

  # Looks up variables by name for each pattern.
  #
  # @param patterns values passed one by one as `name:` to `variables.find`
  # @return [Nodes] all matches, flattened, in pattern order
  def fetch(*patterns)
    found = patterns.map { |pattern| variables.find(name: pattern) }
    Nodes[*found.flatten]
  end

  # Same lookup as #fetch, returned as a hash keyed by variable name.
  #
  # @return [Hash] variable name => variable
  def fetch_hash(*patterns)
    pairs = fetch(*patterns).map { |var| [var.name, var] }
    pairs.to_h
  end

  # Builds a Date from the fetched variables (text of each converted with
  # to_i), after dropping trailing nils.
  #
  # @return [Date, nil] nil when nothing was fetched
  def fetch_date(*patterns)
    components = fetch(*patterns)
    components.pop until components.empty? || !components.last.nil?

    return nil if components.empty?

    Date.new(*components.map { |var| var.to_s.to_i })
  end

  # Equal to any Tree::Template for which `_eq` holds.
  def ==(other)
    other.is_a?(Tree::Template) && _eq(other)
  end

  protected

  # Uses the class-level clean name when the instance name is exactly the
  # name the class was defined with; defers to super otherwise.
  def clean_class
    self.class.template_name == name ? self.class.clean_name : super
  end
end
|
55
|
+
|
56
|
+
# Template rendered as the text of all its unnamed variables, separated
# by spaces. Exposing those variables as children also allows
# in-template navigation.
class Show < Base
  alias children unnamed_variables

  protected

  # Separator placed between children.
  def children_separator
    ' '
  end
end
|
67
|
+
|
68
|
+
# Template whose text is whatever #replace returns; subclasses are
# expected to override #replace.
class Replace < Base
  # @raise [NotImplementedError] unless overridden
  def replace
    raise NotImplementedError, "Descendants should define :replace"
  end

  # Text is delegated to #replace.
  def text
    replace
  end
end
|
77
|
+
|
78
|
+
# Template whose text is its own name.
class Literal < Base
  alias text name
end
|
81
|
+
end
|
82
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
  module Templates
    # Registry of template classes, filled through a small DSL
    # (`template`, `replace`, `show`, `literal`) evaluated on the
    # instance. Later definitions are put in front of earlier ones, so
    # the most recent matching definition wins on lookup.
    class Set
      # @yield optional DSL block, evaluated as in #define
      def initialize(&definitions)
        @templates = []
        define(&definitions) if definitions
      end

      # Finds the class registered for a template name.
      #
      # @param name [String] template name; downcased before matching
      # @return [Class] first class whose matcher `===` the downcased
      #   name, or Base when nothing matches
      def find(name)
        @templates.each do |matcher, template_class|
          return template_class if matcher === name.downcase
        end
        Base
      end

      # Evaluates the DSL block in the context of this set.
      def define(&definitions)
        instance_eval(&definitions)
      end

      # Forgets all definitions.
      def clear
        @templates.clear
      end

      private

      # Defines a template class derived from Base (or from options[:base]).
      def template(name, options = {}, &definition)
        setup_class(name, Base, options, &definition)
      end

      # Defines Replace templates: either `replace(name, text)` or
      # `replace(name => text, ...)`.
      #
      # @raise [ArgumentError] for any other argument shape
      def replace(*replacements)
        if replacements.count == 2 && replacements.all? { |r| r.is_a?(String) }
          name, what = *replacements
          setup_class(name, Replace) { define_method(:replace) { what } }
        elsif replacements.count == 1 && replacements.first.is_a?(Hash)
          replacements.first.each { |name, what| replace(name, what) }
        else
          fail(ArgumentError, "Can't call :replace with #{replacements.join(', ')}")
        end
      end

      # Defines one Show template per name.
      def show(*names)
        names.each { |name| setup_class(name, Show) }
      end

      # Defines one Literal template per name.
      def literal(*names)
        names.each { |name| setup_class(name, Literal) }
      end

      # Creates and registers the class for one definition.
      #
      # options[:match] overrides the matcher (default: downcased name);
      # options[:base] overrides the base class and may be a String, in
      # which case it is resolved through #find.
      #
      # @return [Class] the newly created class
      def setup_class(name, base_class, options = {}, &definition)
        matcher = options.fetch(:match, name.downcase)
        parent = options.fetch(:base, base_class)
        parent = find(parent) if parent.is_a?(String)

        template_class = Class.new(parent, &definition)
        template_class.template_name = name
        template_class.template_options = options
        @templates.unshift([matcher, template_class])
        template_class
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
# Infoboxer provides you with tree structure of the Wikipedia page,
|
4
|
+
# which you can introspect and navigate with ease. This tree structure
|
5
|
+
# tries to be simple, close to Wikipedia source and logical.
|
6
|
+
#
|
7
|
+
# You can always inspect entire page tree yourself:
|
8
|
+
#
|
9
|
+
# ```ruby
|
10
|
+
# page = Infoboxer.wp.get('Argentina')
|
11
|
+
# puts page.to_tree
|
12
|
+
# ```
|
13
|
+
#
|
14
|
+
# ## Inspecting and understanding single node
|
15
|
+
#
|
16
|
+
# Each tree node is descendant of {Tree::Node}, so you should look
|
17
|
+
# at this class to understand what you can do.
|
18
|
+
#
|
19
|
+
# Alongside with basic methods, defined in Node class, some useful
|
20
|
+
# utility methods are defined in subclasses.
|
21
|
+
#
|
22
|
+
# Here's full list of subclasses, representing real nodes, with their
|
23
|
+
# respective roles:
|
24
|
+
#
|
25
|
+
# * inline markup: {Text}, {Bold}, {Italic}, {BoldItalic}, {Wikilink},
|
26
|
+
# {ExternalLink}, {Image};
|
27
|
+
# * embedded HTML: {HTMLTag}, {HTMLOpeningTag}, {HTMLClosingTag};
|
28
|
+
# * paragraph-level nodes: {Heading}, {Paragraph}, {Pre}, {HR};
|
29
|
+
# * lists: {OrderedList}, {UnorderedList}, {DefinitionList}, {ListItem},
|
30
|
+
# {DTerm}, {DDefinition};
|
31
|
+
# * tables: {Table}, {TableCaption}, {TableRow}, {TableHeading}, {TableCell};
|
32
|
+
# * special elements: {Template}, {Ref}.
|
33
|
+
#
|
34
|
+
# ## Tree navigation
|
35
|
+
#
|
36
|
+
# {Tree::Node} class has a standard list of methods for traversing tree
|
37
|
+
# upwards, downwards and sideways: `children`, `parent`, `siblings`,
|
38
|
+
# `index`. Read through class documentation for their detailed
|
39
|
+
# descriptions.
|
40
|
+
#
|
41
|
+
# {Navigation} module contains more advanced navigational functionality,
|
42
|
+
# like XPath-like selectors, friendly shortcuts, breakup of document
|
43
|
+
# into logical "sections" and so on.
|
44
|
+
#
|
45
|
+
# Most of navigational and other Node's methods return {Nodes} type,
|
46
|
+
# which is an `Array` descendant with additional functionality.
|
47
|
+
#
|
48
|
+
# ## Complex data extraction
|
49
|
+
#
|
50
|
+
# Most of uniform, machine-extractable data in Wikipedia is stored in
|
51
|
+
# templates and tables. There's entire {Templates} module, which is
|
52
|
+
# documented explaining what you can do about Wikipedia templates, how
|
53
|
+
# to understand them and use information. Also, you can look at {Table}
|
54
|
+
# class, which for now is not that powerful, yet allows you to extract
|
55
|
+
# some columns and rows.
|
56
|
+
#
|
57
|
+
# Also, consider that WIKIpedia is made of WIKIlinks, and {Wikilink#follow}
|
58
|
+
# (as well as {Nodes#follow} for multiple links at once) is your good friend.
|
59
|
+
#
|
60
|
+
module Tree
  # Files are required in exactly this order: node and nodes first, then
  # the remaining node types.
  %w[
    node nodes
    text compound inline
    image html paragraphs list template table ref
    document
  ].each { |file| require_relative "tree/#{file}" }
end
|
70
|
+
end
|