infoboxer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.dokaz +1 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/Parsing.md +33 -0
- data/README.md +115 -0
- data/examples/output/.gitkeep +0 -0
- data/examples/pages/argentina.wiki +808 -0
- data/examples/to_text.rb +8 -0
- data/examples/tree.rb +8 -0
- data/infoboxer.gemspec +43 -0
- data/lib/infoboxer.rb +196 -0
- data/lib/infoboxer/core_ext.rb +10 -0
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
- data/lib/infoboxer/media_wiki.rb +162 -0
- data/lib/infoboxer/media_wiki/page.rb +38 -0
- data/lib/infoboxer/media_wiki/traits.rb +60 -0
- data/lib/infoboxer/navigation.rb +84 -0
- data/lib/infoboxer/navigation/lookup.rb +216 -0
- data/lib/infoboxer/navigation/sections.rb +179 -0
- data/lib/infoboxer/navigation/selector.rb +59 -0
- data/lib/infoboxer/navigation/shortcuts.rb +165 -0
- data/lib/infoboxer/parser.rb +71 -0
- data/lib/infoboxer/parser/context.rb +165 -0
- data/lib/infoboxer/parser/html.rb +58 -0
- data/lib/infoboxer/parser/image.rb +59 -0
- data/lib/infoboxer/parser/inline.rb +142 -0
- data/lib/infoboxer/parser/paragraphs.rb +66 -0
- data/lib/infoboxer/parser/table.rb +132 -0
- data/lib/infoboxer/parser/template.rb +47 -0
- data/lib/infoboxer/parser/util.rb +73 -0
- data/lib/infoboxer/templates.rb +10 -0
- data/lib/infoboxer/templates/base.rb +82 -0
- data/lib/infoboxer/templates/set.rb +72 -0
- data/lib/infoboxer/tree.rb +70 -0
- data/lib/infoboxer/tree/compound.rb +81 -0
- data/lib/infoboxer/tree/document.rb +11 -0
- data/lib/infoboxer/tree/html.rb +76 -0
- data/lib/infoboxer/tree/image.rb +53 -0
- data/lib/infoboxer/tree/inline.rb +39 -0
- data/lib/infoboxer/tree/list.rb +160 -0
- data/lib/infoboxer/tree/node.rb +181 -0
- data/lib/infoboxer/tree/nodes.rb +185 -0
- data/lib/infoboxer/tree/paragraphs.rb +122 -0
- data/lib/infoboxer/tree/ref.rb +34 -0
- data/lib/infoboxer/tree/table.rb +89 -0
- data/lib/infoboxer/tree/template.rb +82 -0
- data/lib/infoboxer/tree/text.rb +60 -0
- data/lib/infoboxer/tree/wikilink.rb +83 -0
- data/lib/infoboxer/version.rb +4 -0
- data/profile/out/.gitkeep +0 -0
- data/profile/pages/argentina.txt +808 -0
- data/profile/pages/canada.wiki +544 -0
- data/profile/pages/ukraine.wiki +1006 -0
- data/profile/pages/usa.wiki +843 -0
- data/regression/pages/canada.wiki +544 -0
- data/regression/pages/chiang_mai.wiki +2615 -0
- data/regression/pages/south_america.wiki +640 -0
- data/regression/pages/ukraine.wiki +1006 -0
- data/regression/pages/usa.wiki +843 -0
- metadata +272 -0
@@ -0,0 +1,132 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# http://en.wikipedia.org/wiki/Help:Table
|
5
|
+
module Table
|
6
|
+
include Tree
|
7
|
+
|
8
|
+
# Parses a complete wiki table. The current line must open with `{|`;
# otherwise the context is asked to fail. Lines are consumed one by one
# until table_next_line reports that the table is finished.
#
# @return [Tree::Table] the table node, with childless rows removed
def table
  unless @context.current =~ /^\s*{\|/
    @context.fail!('Something went wrong: trying to parse not a table')
  end

  params = table_params
  node = Tree::Table.new(Nodes[], params)

  @context.next!

  loop do
    break unless table_next_line(node)
    @context.next!
  end

  # FIXME: not the most elegant way, huh?
  node.children.reject! { |row| row.children.empty? }

  node
end
|
27
|
+
|
28
|
+
# Skips the opening `{|` marker and parses whatever is left on the
# current line as table attributes.
#
# @return [Hash] result of parse_params for the rest of the line
def table_params
  @context.skip(/\s*{\|/)
  attributes_source = @context.rest
  parse_params(attributes_source)
end
|
32
|
+
|
33
|
+
# Dispatches the current line of a table to the matching handler.
# Branch order matters: more specific markers (`|}`, `|+`, `|-`) are
# tested before the generic cell marker `|`.
#
# @param table the table node being filled
# @return [Boolean] false once the closing `|}` was consumed, true otherwise
def table_next_line(table)
  case @context.current
  when /^\s*\|}(.*)$/ # table end
    @context.scan(/^\s*\|}/)
    return false # should not continue
  when /^\s*!/ # heading (th) in a row
    table_cells(table, TableHeading)
  when /^\s*\|\+/ # caption
    table_caption(table)
  when /^\s*\|-(.*)$/ # row start; the capture holds the row attributes
    table_row(table, Regexp.last_match(1))
  when /^\s*\|/ # cell in row
    table_cells(table)
  when /^\s*{{/ # template can be at row level
    table_template(table)
  when nil
    @context.fail!("End of input before table ended!")
  else
    table_cell_cont(table)
  end

  true # should continue parsing
end
|
62
|
+
|
63
|
+
# Starts a new, empty row in the table.
#
# @param table the table node receiving the row
# @param param_str [String, nil] raw attribute text following `|-`
def table_row(table, param_str)
  row = TableRow.new(Nodes[], parse_params(param_str))
  table.push_children(row)
end
|
66
|
+
|
67
|
+
# Parses a `|+` caption line and appends a TableCaption to the table.
#
# @param table the table node receiving the caption
def table_caption(table)
  @context.skip(/^\s*\|\+\s*/)

  caption_nodes = inline(/^\s*([|!]|{\|)/)
  @context.prev! # compensate next! which will be done in table()

  caption = TableCaption.new(caption_nodes.strip)
  table.push_children(caption)
end
|
74
|
+
|
75
|
+
# Parses every cell found on the current line (cells on one line are
# separated by `||` or `!!`) and appends them to the table's last row,
# creating that row first when the table does not end with one.
#
# @param table the table node being filled
# @param cell_class class instantiated for each cell (TableCell by default)
def table_cells(table, cell_class = TableCell)
  table.push_children(TableRow.new) unless table.children.last.is_a?(TableRow)
  row = table.children.last

  @context.skip(/\s*[!|]\s*/)
  guarded_loop do
    # Text before a single `|` (not `||`) is the cell's attribute list.
    params =
      if @context.check(/[^|{|\[]+\|([^\|]|$)/)
        parse_params(@context.scan_until(/\|/))
      else
        {}
      end

    content = short_inline(/(\|\||!!)/)
    row.push_children(cell_class.new(content, params))
    break if @context.eol?
  end
end
|
91
|
+
|
92
|
+
# Handles a template that appears at row level inside a table. The parsed
# contents go to the innermost open container: the last cell of the last
# row, else the last row itself, else the table.
#
# @param table the table node being filled
def table_template(table)
  contents = paragraph(/^\s*([|!]|{\|)/).to_templates?

  row = table.children.last
  target =
    if row.is_a?(TableRow)
      cell = row.children.last
      cell.is_a?(BaseCell) ? cell : row
    else
      table
    end

  target.push_children(*contents)
end
|
105
|
+
|
106
|
+
# On-the-fly TableCaption creation handles (real life) case, when
|
107
|
+
# table has "HTML caption":
|
108
|
+
# {|
|
109
|
+
# <caption>....</caption>
|
110
|
+
#
|
111
|
+
# Solution is NOT elegant or semantically "right", yet it works.
|
112
|
+
# Somehow.
|
113
|
+
#
|
114
|
+
# Handles a line that carries no table marker of its own. The paragraph
# parsed from it is appended to the last cell of the last row, to a
# trailing caption, or to a freshly created TableCaption; a container
# that has no parent yet is then attached to the table.
#
# @param table the table node being filled
def table_cell_cont(table)
  last = table.children.last

  container =
    case last
    when TableRow
      cell = last.children.last
      cell.is_a?(BaseCell) ? cell : TableCaption.new
    when TableCaption
      last
    when nil
      TableCaption.new
    else
      @context.fail!("Multiline cell inside #{last}")
    end

  container.push_children(paragraph(/^\s*([|!]|{\|)/))
  table.push_children(container) unless container.parent
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Template
|
5
|
+
include Tree
|
6
|
+
|
7
|
+
# NB: here we do not distinguish between templates like {{Infobox|variable}}
|
8
|
+
# and "magic words" like {{formatnum:123}}
|
9
|
+
# Just calling all of them "templates". This behaviour will change
|
10
|
+
# in future, I presume
|
11
|
+
# More about magic words: https://www.mediawiki.org/wiki/Help:Magic_words
|
12
|
+
# Parses one template call. The name is read up to `|`, `:` or `}}`;
# variables are parsed only when the template is not closed right away.
#
# @return an instance of the class the context's template set finds for
#   the name, built from the name and the parsed variables
def template
  name = @context.scan_continued_until(/\||:|}}/)
  @context.fail!("Template name not found") unless name

  name.strip!
  vars =
    if @context.eat_matched?('}}')
      Nodes[]
    else
      template_vars
    end

  template_class = @context.traits.templates.find(name)
  template_class.new(name, vars)
end
|
20
|
+
|
21
|
+
# Parses template variables up to the closing `}}`. Variables written as
# `name = value` keep their name; the others are named by their 1-based
# position, and an unnamed variable with an empty value is dropped.
#
# @return [Nodes] the collected Var nodes
def template_vars
  vars = Nodes[]
  position = 1

  guarded_loop do
    name =
      if @context.check(/\s*([^ =}|]+)\s*=\s*/)
        scanned = @context.scan(/\s*([^ =]+)/).strip
        @context.skip(/\s*=\s*/)
        scanned
      else
        position
      end

    value = long_inline(/\||}}/)
    # it was just empty line otherwise
    vars << Var.new(name.to_s, value) unless value.empty? && name.is_a?(Numeric)

    break if @context.eat_matched?('}}')
    @context.fail!("Unexpected break of template variables: #{vars}") if @context.eof?

    position += 1
  end

  vars
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Util
|
5
|
+
attr_reader :re
|
6
|
+
|
7
|
+
# Matches the opening of any inline construct that interrupts plain text.
# Alternatives are tried in order, so the specific `<nowiki>` and `<ref>`
# tags win over the generic `<` of an HTML tag.
FORMATTING = /(
  '{2,5}        | # bold, italic
  \[\[          | # link
  {{            | # template
  \[[a-z]+:\/\/ | # external link
  <nowiki[^>]*> | # nowiki
  <ref[^>]*>    | # reference
  <               # HTML tag
)/x
|
16
|
+
|
17
|
+
# Zero-width lookahead: matches right before a closing `</ref>` tag or a
# closing `}}`, consuming neither.
INLINE_EOL = %r[(?=
  </ref> |  # <ref> closed
  }}        # or template closed
)]x
|
21
|
+
|
22
|
+
|
23
|
+
# Builds the regexps used by the parser.
#
# The two caches are hashes that lazily compute, for a given terminator
# regexp (or nil), the union of that terminator with the formatting
# markers and end of line; the "short" variant also stops at INLINE_EOL.
#
# @return [Hash] :file_prefix, :formatting, :inline_until_cache and
#   :short_inline_until_cache
def make_regexps
  prefixes = @context.traits.file_prefix.join('|')

  inline_cache = Hash.new do |cache, terminator|
    cache[terminator] = Regexp.union(*[terminator, FORMATTING, /$/].compact.uniq)
  end

  short_inline_cache = Hash.new do |cache, terminator|
    cache[terminator] = Regexp.union(*[terminator, INLINE_EOL, FORMATTING, /$/].compact.uniq)
  end

  {
    file_prefix: /(#{prefixes}):/,
    formatting: FORMATTING,
    inline_until_cache: inline_cache,
    short_inline_until_cache: short_inline_cache
  }
end
|
35
|
+
|
36
|
+
# Parses an HTML-like attribute string (`class="a b" border=1 nowrap`)
# into a hash.
#
# * `name="value"` / `name='value'` - value is the text between the quotes;
#   when the closing quote is missing, the rest of the string is the value
#   (previously this raised NoMethodError on nil).
# * `name=value` - value runs up to, and no longer includes, the next
#   whitespace character.
# * bare `name` - stored with the name itself as the value.
#
# @param str [String, nil] raw attribute text
# @return [Hash{Symbol => String}] parsed attributes; empty for nil input
def parse_params(str)
  return {} unless str

  scanner = StringScanner.new(str)
  params = {}

  loop do
    scanner.skip(/\s*/)
    name = scanner.scan(/[^ \t=]+/) or break
    scanner.skip(/\s*/)

    if scanner.peek(1) == '='
      scanner.skip(/=\s*/)
      quote = scanner.scan(/['"]/)
      value =
        if quote
          quoted = scanner.scan_until(/#{quote}/)
          # Unterminated quote: take everything that is left.
          quoted ? quoted.chomp(quote) : scanner.scan(/.*/m)
        else
          scanner.scan(/\S*/)
        end
      params[name.to_sym] = value
    else
      params[name.to_sym] = name
    end
  end

  params
end
|
60
|
+
|
61
|
+
# Runs the given block repeatedly until the block itself breaks out.
# After each iteration the context position (line and column) is compared
# with the position before it; if nothing moved, the context is asked to
# fail, since the parser would otherwise spin forever.
def guarded_loop
  loop do
    before = [@context.lineno, @context.colno]
    yield
    after = [@context.lineno, @context.colno]

    next unless after == before

    @context.fail!("Infinite loop on position #{after.last}")
  end
end
|
70
|
+
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Infoboxer
|
2
|
+
module Templates
|
3
|
+
# Base class for template nodes. The class-level `template_name` and
# `template_options` accessors hold the name and options a class was
# registered with.
class Base < Tree::Template
  include Tree

  class << self
    attr_accessor :template_name, :template_options

    # Readable name for named template classes; default inspect otherwise.
    def inspect
      return super unless template_name

      "Infoboxer::Templates::#{clean_name}"
    end

    def clean_name
      if template_name
        "Template[#{template_name}]"
      else
        'Template'
      end
    end
  end

  # Variables whose name consists only of digits.
  def unnamed_variables
    variables.select { |var| var.name =~ /^\d+$/ }
  end

  # Looks up variables by each of the given name patterns, in order.
  #
  # @return [Nodes] all variables found, flattened into one list
  def fetch(*patterns)
    found = patterns.map { |pattern| variables.find(name: pattern) }
    Nodes[*found.flatten]
  end

  # Same lookup as #fetch, returned as a hash of name => variable.
  def fetch_hash(*patterns)
    pairs = fetch(*patterns).map { |var| [var.name, var] }
    pairs.to_h
  end

  # Builds a Date from the fetched variables, each converted with
  # `to_s.to_i`, after dropping trailing nils.
  #
  # @return [Date, nil] nil when no components are left
  def fetch_date(*patterns)
    components = fetch(*patterns)
    components.pop until components.empty? || !components.last.nil?

    return nil if components.empty?

    Date.new(*components.map { |component| component.to_s.to_i })
  end

  def ==(other)
    other.is_a?(Tree::Template) && _eq(other)
  end

  protected

  # Uses the class-level clean name only when this node's name is exactly
  # the name the class was registered with.
  def clean_class
    return super unless self.class.template_name == name

    self.class.clean_name
  end
end
|
55
|
+
|
56
|
+
# Renders all of its unnamed variables as space-separated text
|
57
|
+
# Also allows in-template navigation
|
58
|
+
# Template whose children are its unnamed variables, joined with a single
# space as separator.
class Show < Base
  alias children unnamed_variables

  protected

  def children_separator
    ' '
  end
end
|
67
|
+
|
68
|
+
# Template whose text is whatever #replace returns. Subclasses must
# override #replace; the default implementation raises.
class Replace < Base
  # @raise [NotImplementedError] always, unless overridden
  def replace
    raise NotImplementedError, "Descendants should define :replace"
  end

  def text
    replace
  end
end
|
77
|
+
|
78
|
+
# Template whose text is simply its own name.
class Literal < Base
  alias text name
end
|
81
|
+
end
|
82
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Templates
|
4
|
+
# Registry of template classes, filled through a small DSL
# (`template`, `replace`, `show`, `literal`) evaluated in the context of
# the set. Later definitions take precedence over earlier ones.
class Set
  # @yield optional block of DSL definitions, see #define
  def initialize(&definitions)
    @templates = []
    define(&definitions) if definitions
  end

  # Finds the class registered for a template name. The lowercased name is
  # tested with `===` against each registered matcher, newest first.
  #
  # @param name [String] template name as found in the page
  # @return [Class] the matching class, or Base when nothing matches
  def find(name)
    key = name.downcase # computed once, not once per registered template
    _, template = @templates.detect { |matcher, _| matcher === key }
    template || Base
  end

  # Evaluates the block in the context of this set, adding definitions.
  def define(&definitions)
    instance_eval(&definitions)
  end

  # Removes every registered template.
  def clear
    @templates.clear
  end

  private

  # Registers a Base-derived class; options may carry :match and :base.
  def template(name, options = {}, &definition)
    setup_class(name, Base, options, &definition)
  end

  # Registers Replace-derived classes. Accepts either two strings
  # (name, replacement) or a single hash of name => replacement.
  #
  # @raise [ArgumentError] for any other argument shape
  def replace(*replacements)
    if replacements.count == 2 && replacements.all? { |r| r.is_a?(String) }
      name, what = *replacements
      setup_class(name, Replace) do
        define_method(:replace) { what }
      end
    elsif replacements.count == 1 && replacements.first.is_a?(Hash)
      replacements.first.each { |name, what| replace(name, what) }
    else
      raise ArgumentError, "Can't call :replace with #{replacements.join(', ')}"
    end
  end

  # Registers a Show-derived class for each name.
  def show(*names)
    names.each { |name| setup_class(name, Show) }
  end

  # Registers a Literal-derived class for each name.
  def literal(*names)
    names.each { |name| setup_class(name, Literal) }
  end

  # Creates an anonymous subclass and puts it at the front of the registry.
  # The matcher defaults to the lowercased name; a String :base is resolved
  # through #find.
  def setup_class(name, base_class, options = {}, &definition)
    match = options.fetch(:match, name.downcase)
    base = options.fetch(:base, base_class)
    base = find(base) if base.is_a?(String)

    Class.new(base, &definition).tap do |cls|
      cls.template_name = name
      cls.template_options = options
      @templates.unshift [match, cls]
    end
  end
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
# Infoboxer provides you with tree structure of the Wikipedia page,
|
4
|
+
# which you can introspect and navigate with ease. This tree structure
|
5
|
+
# tries to be simple, close to Wikipedia source and logical.
|
6
|
+
#
|
7
|
+
# You can always inspect entire page tree yourself:
|
8
|
+
#
|
9
|
+
# ```ruby
|
10
|
+
# page = Infoboxer.wp.get('Argentina')
|
11
|
+
# puts page.to_tree
|
12
|
+
# ```
|
13
|
+
#
|
14
|
+
# ## Inspecting and understanding single node
|
15
|
+
#
|
16
|
+
# Each tree node is descendant of {Tree::Node}, so you should look
|
17
|
+
# at this class to understand what you can do.
|
18
|
+
#
|
19
|
+
# Alongside with basic methods, defined in Node class, some useful
|
20
|
+
# utility methods are defined in subclasses.
|
21
|
+
#
|
22
|
+
# Here's full list of subclasses, representing real nodes, with their
|
23
|
+
# respective roles:
|
24
|
+
#
|
25
|
+
# * inline markup: {Text}, {Bold}, {Italic}, {BoldItalic}, {Wikilink},
|
26
|
+
# {ExternalLink}, {Image};
|
27
|
+
# * embedded HTML: {HTMLTag}, {HTMLOpeningTag}, {HTMLClosingTag};
|
28
|
+
# * paragraph-level nodes: {Heading}, {Paragraph}, {Pre}, {HR};
|
29
|
+
# * lists: {OrderedList}, {UnorderedList}, {DefinitionList}, {ListItem},
|
30
|
+
# {DTerm}, {DDefinition};
|
31
|
+
# * tables: {Table}, {TableCaption}, {TableRow}, {TableHeading}, {TableCell};
|
32
|
+
# * special elements: {Template}, {Ref}.
|
33
|
+
#
|
34
|
+
# ## Tree navigation
|
35
|
+
#
|
36
|
+
# {Tree::Node} class has a standard list of methods for traversing tree
|
37
|
+
# upwards, downwards and sideways: `children`, `parent`, `siblings`,
|
38
|
+
# `index`. Read through class documentation for their detailed
|
39
|
+
# descriptions.
|
40
|
+
#
|
41
|
+
# {Navigation} module contains more advanced navigational functionality,
|
42
|
+
# like XPath-like selectors, friendly shortcuts, breakup of document
|
43
|
+
# into logical "sections" and so on.
|
44
|
+
#
|
45
|
+
# Most of navigational and other Node's methods return {Nodes} type,
|
46
|
+
# which is an `Array` descendant with additional functionality.
|
47
|
+
#
|
48
|
+
# ## Complex data extraction
|
49
|
+
#
|
50
|
+
# Most of uniform, machine-extractable data in Wikipedia is stored in
|
51
|
+
# templates and tables. There's entire {Templates} module, which is
|
52
|
+
# documented explaining what you can do about Wikipedia templates, how
|
53
|
+
# to understand them and use information. Also, you can look at {Table}
|
54
|
+
# class, which for now is not that powerful, yet allows you to extract
|
55
|
+
# some columns and rows.
|
56
|
+
#
|
57
|
+
# Also, consider that WIKIpedia is made of WIKIlinks, and {Wikilink#follow}
|
58
|
+
# (as well as {Nodes#follow} for multiple links at once) is your good friend.
|
59
|
+
#
|
60
|
+
module Tree
  # The two base files are loaded first, then the node types in this
  # fixed order.
  require_relative 'tree/node'
  require_relative 'tree/nodes'

  node_types = %w[
    text compound inline
    image html paragraphs list template table ref
    document
  ]
  node_types.each { |type| require_relative "tree/#{type}" }
end
|
70
|
+
end
|