infoboxer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dokaz +1 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/Parsing.md +33 -0
- data/README.md +115 -0
- data/examples/output/.gitkeep +0 -0
- data/examples/pages/argentina.wiki +808 -0
- data/examples/to_text.rb +8 -0
- data/examples/tree.rb +8 -0
- data/infoboxer.gemspec +43 -0
- data/lib/infoboxer.rb +196 -0
- data/lib/infoboxer/core_ext.rb +10 -0
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
- data/lib/infoboxer/media_wiki.rb +162 -0
- data/lib/infoboxer/media_wiki/page.rb +38 -0
- data/lib/infoboxer/media_wiki/traits.rb +60 -0
- data/lib/infoboxer/navigation.rb +84 -0
- data/lib/infoboxer/navigation/lookup.rb +216 -0
- data/lib/infoboxer/navigation/sections.rb +179 -0
- data/lib/infoboxer/navigation/selector.rb +59 -0
- data/lib/infoboxer/navigation/shortcuts.rb +165 -0
- data/lib/infoboxer/parser.rb +71 -0
- data/lib/infoboxer/parser/context.rb +165 -0
- data/lib/infoboxer/parser/html.rb +58 -0
- data/lib/infoboxer/parser/image.rb +59 -0
- data/lib/infoboxer/parser/inline.rb +142 -0
- data/lib/infoboxer/parser/paragraphs.rb +66 -0
- data/lib/infoboxer/parser/table.rb +132 -0
- data/lib/infoboxer/parser/template.rb +47 -0
- data/lib/infoboxer/parser/util.rb +73 -0
- data/lib/infoboxer/templates.rb +10 -0
- data/lib/infoboxer/templates/base.rb +82 -0
- data/lib/infoboxer/templates/set.rb +72 -0
- data/lib/infoboxer/tree.rb +70 -0
- data/lib/infoboxer/tree/compound.rb +81 -0
- data/lib/infoboxer/tree/document.rb +11 -0
- data/lib/infoboxer/tree/html.rb +76 -0
- data/lib/infoboxer/tree/image.rb +53 -0
- data/lib/infoboxer/tree/inline.rb +39 -0
- data/lib/infoboxer/tree/list.rb +160 -0
- data/lib/infoboxer/tree/node.rb +181 -0
- data/lib/infoboxer/tree/nodes.rb +185 -0
- data/lib/infoboxer/tree/paragraphs.rb +122 -0
- data/lib/infoboxer/tree/ref.rb +34 -0
- data/lib/infoboxer/tree/table.rb +89 -0
- data/lib/infoboxer/tree/template.rb +82 -0
- data/lib/infoboxer/tree/text.rb +60 -0
- data/lib/infoboxer/tree/wikilink.rb +83 -0
- data/lib/infoboxer/version.rb +4 -0
- data/profile/out/.gitkeep +0 -0
- data/profile/pages/argentina.txt +808 -0
- data/profile/pages/canada.wiki +544 -0
- data/profile/pages/ukraine.wiki +1006 -0
- data/profile/pages/usa.wiki +843 -0
- data/regression/pages/canada.wiki +544 -0
- data/regression/pages/chiang_mai.wiki +2615 -0
- data/regression/pages/south_america.wiki +640 -0
- data/regression/pages/ukraine.wiki +1006 -0
- data/regression/pages/usa.wiki +843 -0
- metadata +272 -0
@@ -0,0 +1,179 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Navigation
|
4
|
+
# `Sections` module provides logical view on document structure.
|
5
|
+
#
|
6
|
+
# From this module's point of view, each {Tree::Document Document} is a
|
7
|
+
# {Sections::Container Sections::Container}, which consists of
|
8
|
+
# {Sections::Container#intro} (before first heading) and a set of
|
9
|
+
# nested {Sections::Container#sections}.
|
10
|
+
#
|
11
|
+
# Each document node, in turn, provides method {Sections::Node#in_sections},
|
12
|
+
# allowing you to receive list of sections, which contains current
|
13
|
+
# node.
|
14
|
+
#
|
15
|
+
# **NB**: Sections are "virtual" nodes, they are not, in fact, in
|
16
|
+
# document's tree. So, you may be surprised by:
|
17
|
+
#
|
18
|
+
# ```ruby
|
19
|
+
# document.sections # => list of Section instances
|
20
|
+
# document.lookup(:Section) # => []
|
21
|
+
#
|
22
|
+
# paragraph.in_sections # => list of sections
|
23
|
+
# paragraph.
|
24
|
+
# lookup_parents(:Section) # => []
|
25
|
+
# ```
|
26
|
+
module Sections
|
27
|
+
# This module is included in {Tree::Document Document}, allowing
|
28
|
+
# you to navigate through document's logical sections (and also
|
29
|
+
# included in each {Sections::Section} instance, allowing to navigate
|
30
|
+
# recursively).
|
31
|
+
#
|
32
|
+
# See also {Sections parent module} docs.
|
33
|
+
module Container
  # Paragraph-level children of the container that come before its
  # first heading.
  #
  # @return {Tree::Nodes}
  def intro
    leading = children.take_while { |node| !node.is_a?(Tree::Heading) }
    leading.select { |node| node.is_a?(Tree::BaseParagraph) }
  end

  # Sections of the current container, optionally filtered by heading.
  #
  # Each name is matched against the heading text with `===`, so both
  # strings and regexps work. Several names select nested sections,
  # one nesting level per name.
  #
  # Examples of usage:
  #
  # ```ruby
  # document.sections                 # all top-level sections
  # document.sections('Culture')      # only "Culture" section
  # document.sections(/^List of/)     # all sections with heading matching pattern
  #
  # document.
  #   sections('Culture').            # long way of receiving nested section
  #   sections('Music')               # (Culture / Music)
  #
  # document.
  #   sections('Culture', 'Music')    # the same as above
  #
  # document.
  #   sections('Culture' => 'Music')  # pretty-looking version for 2 levels of nesting
  # ```
  #
  # @raise [ArgumentError] when the first argument is a Hash that does
  #   not have exactly one pair.
  # @return {Tree::Nodes<Section>}
  def sections(*names)
    @sections ||= make_sections

    # `'Outer' => 'Inner'` is sugar for the two positional names.
    if names.first.is_a?(Hash)
      pair = names.shift
      fail(ArgumentError, "Undefined behavior with #{pair}") unless pair.count == 1
      names.unshift(pair.keys.first, pair.values.first)
    end

    return @sections if names.empty?

    pattern, *nested = names
    found = @sections.select { |section| pattern === section.heading.text_ }
    nested.empty? ? found : found.sections(*nested)
  end

  private

  # Splits the container's children into sections, using the level of
  # the first heading found by `headings` as the section level.
  def make_sections
    result = Tree::Nodes[]
    return result if headings.empty?

    section_level = headings.first.level
    runs = children.chunk { |node| node.matches?(Tree::Heading, level: section_level) }

    # Everything before the first section heading is not part of any section.
    runs.drop_while { |heading_run, _| !heading_run }.each do |heading_run, nodes|
      if heading_run
        nodes.each { |heading| result << Section.new(heading) }
      else
        result.last.push_children(*nodes)
      end
    end

    result
  end
end
|
106
|
+
|
107
|
+
# Part of {Sections} navigation, allowing each node to know exact
|
108
|
+
# list of sections it is contained in.
|
109
|
+
#
|
110
|
+
# See also {Sections parent module} documentation.
|
111
|
+
module Node
  # List of sections the current node is contained in (bottom-to-top:
  # smallest section first).
  #
  # The enclosing section is rebuilt from the nearest preceding heading:
  # its body is every following sibling up to (not including) the next
  # heading of the same or an outer level. Headings with a larger level
  # number are sub-headings and stay inside the body.
  #
  # @return {Tree::Nodes<Section>} empty when no heading precedes the node.
  def in_sections
    # Sections are built from direct children of the document, so start
    # from the node itself when it is one.
    # NOTE(review): presumably `lookup_parents` lists ancestors nearest-first
    # with the document last, making `[-2]` the top-level ancestor - confirm.
    main_node = parent.is_a?(Tree::Document) ? self : lookup_parents[-2]

    heading =
      if main_node.is_a?(Tree::Heading)
        # A heading belongs to the section opened by the previous heading
        # one level above it.
        main_node.lookup_prev_siblings(Tree::Heading, level: main_node.level - 1).last
      else
        main_node.lookup_prev_siblings(Tree::Heading).last
      end
    return Tree::Nodes[] unless heading

    # Outer headings have smaller levels (see `level - 1` above), so the
    # body continues through deeper headings and stops at the first heading
    # whose level is not greater than this section's own.
    body = heading.next_siblings.take_while do |node|
      !node.is_a?(Tree::Heading) || node.level > heading.level
    end

    Tree::Nodes[Section.new(heading, body), *heading.in_sections]
  end
end
|
133
|
+
|
134
|
+
# Part of {Sections} navigation, allowing chains of section search.
|
135
|
+
#
|
136
|
+
# See {Sections parent module} documentation.
|
137
|
+
module Nodes
  # @!method sections(*names)
  # @!method in_sections

  # Each generated method calls the same-named method on every node of
  # the list and hands the collected results to `make_nodes`.
  [:sections, :in_sections].each do |name|
    define_method(name) do |*args|
      collected = map { |node| node.send(name, *args) }
      make_nodes(collected)
    end
  end
end
|
147
|
+
|
148
|
+
# Virtual node, representing logical section of the document.
|
149
|
+
# Is not, in fact, in the tree.
|
150
|
+
#
|
151
|
+
# See {Sections parent module} documentation for details.
|
152
|
+
class Section < Tree::Compound
  include Container

  # Section's heading.
  #
  # @return {Tree::Heading}
  attr_reader :heading

  # @param heading the heading node that opens the section
  # @param children nodes belonging to the section (copied into a fresh
  #   {Tree::Nodes})
  def initialize(heading, children = Tree::Nodes[])
    # Deliberately no `super`: we don't want to rewrite children's parent.
    @heading = heading
    @children = Tree::Nodes[*children]
  end

  # Appends nodes to the section; no rewriting of parent, again.
  def push_children(*nodes)
    nodes.each { |node| @children << node }
  end

  # A section always reports itself as non-empty.
  def empty?
    false
  end
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Navigation
|
4
|
+
module Lookup
|
5
|
+
# Encapsulates storage of selectors, used in {Lookup::Node node lookup}.
|
6
|
+
#
|
7
|
+
# See {Lookup::Node Lookup::Node} for detailed explanation of available selectors.
|
8
|
+
class Selector
  include ProcMe

  # Builds a selector from any mix of criteria:
  #
  # * a Symbol naming a constant under `Tree` (e.g. `:Heading`) is
  #   replaced with that constant;
  # * any other Symbol is kept and later called on the node as a predicate;
  # * a Hash maps attribute names to expected values (pairs with `nil`
  #   values are ignored);
  # * a block / Proc is called with the node;
  # * anything else is compared to the node with `===`.
  #
  # Hash criteria are filtered into a new Hash, so hashes owned by the
  # caller are never modified.
  def initialize(*arg, &block)
    @arg = [arg, block].flatten.compact.map do |criterion|
      if criterion.is_a?(Hash)
        criterion.reject { |_attr, value| value.nil? }
      else
        sym_to_class(criterion)
      end
    end
  end

  # Normalized list of criteria.
  attr_reader :arg

  # Selectors are equal when they have the same class and equal criteria.
  def ==(other)
    self.class == other.class && arg == other.arg
  end

  def inspect
    "#<Selector(#{@arg.map(&:to_s).join(', ')})>"
  end

  # @return true when the node satisfies every criterion (always true
  #   for a selector without criteria).
  def matches?(node)
    @arg.all? { |criterion| arg_matches?(criterion, node) }
  end

  private

  # Turns a capitalized Symbol into the `Tree` constant of that name, when
  # such a constant exists; returns anything else unchanged.
  def sym_to_class(a)
    if a.is_a?(Symbol) && a =~ /^[A-Z][a-zA-Z]+$/ && Tree.const_defined?(a)
      Tree.const_get(a)
    else
      a
    end
  end

  # Checks a single criterion against the node.
  def arg_matches?(check, node)
    case check
    when Proc
      check.call(node)
    when Hash
      # Every attribute must exist on the node and match its expected value.
      check.all? do |attr, value|
        node.respond_to?(attr) && value === node.send(attr)
      end
    when Symbol
      # Predicate name: missing methods count as "does not match".
      node.respond_to?(check) && node.send(check)
    else
      check === node
    end
  end
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Navigation
|
4
|
+
# See {Shortcuts::Node Shortcuts::Node} for everything!
|
5
|
+
module Shortcuts
|
6
|
+
# `Shortcuts::Node` module provides some convenience methods for
|
7
|
+
# most used lookups. It's not rocket science (as you can see
|
8
|
+
# from methods code), yet should make your code cleaner and
|
9
|
+
# more readable.
|
10
|
+
#
|
11
|
+
# **NB**: as usual, {Tree::Nodes} class has synonyms for all of
|
12
|
+
# those methods, so you can call them fearlessly on any results of
|
13
|
+
# node lookup.
|
14
|
+
#
|
15
|
+
module Node
  # Returns all wikilinks inside current node.
  #
  # @param namespace namespace of the links to return. It's `''` (main
  #   namespace only) by default; if you really want all wikilinks on
  #   the page, including categories, interwikies and stuff, use
  #   `wikilinks(nil)`
  # @return {Tree::Nodes}
  def wikilinks(namespace = '')
    lookup(Tree::Wikilink, namespace: namespace)
  end

  # Returns all headings inside current node.
  #
  # @param level headings level to return.
  # @return {Tree::Nodes}
  def headings(level = nil)
    lookup(Tree::Heading, level: level)
  end

  # Returns all paragraph-level nodes (list items, plain paragraphs,
  # headings and so on) inside current node.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def paragraphs(*selectors, &block)
    lookup(Tree::BaseParagraph, *selectors, &block)
  end

  # Returns all external links inside current node.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def external_links(*selectors, &block)
    lookup(Tree::ExternalLink, *selectors, &block)
  end

  # Returns all images (media) inside current node.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def images(*selectors, &block)
    lookup(Tree::Image, *selectors, &block)
  end

  # Returns all templates inside current node.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def templates(*selectors, &block)
    lookup(Tree::Template, *selectors, &block)
  end

  # Returns all tables inside current node.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def tables(*selectors, &block)
    lookup(Tree::Table, *selectors, &block)
  end

  # Returns all lists (ordered/unordered/definition) inside current node.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def lists(*selectors, &block)
    lookup(Tree::List, *selectors, &block)
  end

  # Returns true, if current node is **inside** bold.
  def bold?
    has_parent?(Tree::Bold)
  end

  # Returns true, if current node is **inside** italic.
  def italic?
    has_parent?(Tree::Italic)
  end

  # Returns true, if current node is **inside** heading.
  #
  # @param level optional concrete level to check
  def heading?(level = nil)
    has_parent?(Tree::Heading, level: level)
  end

  # Returns all infoboxes inside current node.
  #
  # Definition of what considered to be infobox depends on templates
  # set used when parsing the page.
  #
  # @param selectors node selectors, as described at {Lookup::Node}
  # @return {Tree::Nodes}
  def infoboxes(*selectors, &block)
    lookup(Tree::Template, :infobox?, *selectors, &block)
  end

  # Returns all wikilinks in "categories namespace".
  #
  # **NB**: depending on your MediaWiki settings, name of categories
  # namespace may vary. When you are using {MediaWiki#get}, Infoboxer
  # tries to handle this transparently (by examining used wiki for
  # category names), yet bad things may happen here.
  #
  # @raise [RuntimeError] when the node is not inside a page or the
  #   page has no site traits.
  # @return {Tree::Nodes}
  def categories
    # The namespace must equal one of the prefixes as a whole. Regexp.union
    # escapes each prefix and keeps the alternation grouped, so the anchors
    # apply to every alternative rather than only the first and last one.
    prefixes = Regexp.union(ensure_traits.category_prefix)
    lookup(Tree::Wikilink, namespace: /\A#{prefixes}\z/)
  end

  # As users accustomed to have only one infobox on a page
  alias_method :infobox, :infoboxes

  private

  # Site traits of the page the node belongs to; fails when there are none.
  def ensure_traits
    ensure_page.traits || fail("No site traits found")
  end

  # The page containing this node (or the node itself when it is a page);
  # fails when the node is not inside any page.
  def ensure_page
    page = is_a?(MediaWiki::Page) ? self : lookup_parents(MediaWiki::Page).first
    page || fail("Node is not inside Page, maybe parsed from text?")
  end
end
|
138
|
+
|
139
|
+
# Companion module of {Shortcuts::Node Shortcuts::Node}, defining
|
140
|
+
# all the same methods for {Tree::Nodes} so you can use them
|
141
|
+
# uniformly on single node or list. See {Shortcuts::Node there} for
|
142
|
+
# details.
|
143
|
+
module Nodes
  # @!method wikilinks(namespace = '')
  # @!method headings(level = nil)
  # @!method paragraphs(*selectors, &block)
  # @!method external_links(*selectors, &block)
  # @!method images(*selectors, &block)
  # @!method templates(*selectors, &block)
  # @!method tables(*selectors, &block)
  # @!method lists(*selectors, &block)
  # @!method infoboxes(*selectors, &block)
  # @!method categories

  # Each generated method calls the same-named shortcut on every node of
  # the list and passes the collected results to `make_nodes`. Both the
  # arguments and the block are forwarded, so block selectors behave the
  # same on a list as they do on a single node.
  [:wikilinks, :headings, :paragraphs, :external_links, :images,
   :templates, :tables, :lists, :infoboxes, :infobox, :categories].
    each do |m|
      define_method(m) do |*args, &block|
        make_nodes(map { |n| n.send(m, *args, &block) })
      end
    end
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'ostruct'
|
3
|
+
require 'procme'
|
4
|
+
|
5
|
+
module Infoboxer
|
6
|
+
class Parser
  # Parser-specific error class (a RuntimeError).
  class ParsingError < RuntimeError
  end

  class << self
    # Builds a parser over `text` and returns the result of its `inline`.
    #
    # @param text source text
    # @param traits nil, a Hash of options or a MediaWiki::Traits
    def inline(text, traits = nil)
      new(context(text, traits)).inline
    end

    # Builds a parser over `text` and returns the result of its `paragraphs`.
    def paragraphs(text, traits = nil)
      new(context(text, traits)).paragraphs
    end

    # First element of {paragraphs} for the same arguments.
    def paragraph(text, traits = nil)
      paragraphs(text, traits).first
    end

    # Wraps the result of {paragraphs} into a Tree::Document.
    def document(text, traits = nil)
      Tree::Document.new(paragraphs(text, traits))
    end

    # Builds a parser over `text` and returns the result of its `long_inline`.
    def fragment(text, traits = nil)
      new(context(text, traits)).long_inline
    end

    private

    # Parsing context for the text with normalized site traits.
    def context(text, traits)
      Context.new(text, coerce_traits(traits))
    end

    # Normalizes the `traits` argument to a MediaWiki::Traits:
    # nil gives the default traits, a Hash is passed to
    # MediaWiki::Traits.new, a MediaWiki::Traits is returned as is.
    #
    # @raise [ArgumentError] for any other kind of value.
    def coerce_traits(traits)
      return MediaWiki::Traits.default if traits.nil?
      return MediaWiki::Traits.new(traits) if traits.is_a?(Hash)
      return traits if traits.is_a?(MediaWiki::Traits)

      fail(ArgumentError, "Can't coerce site traits from #{traits.inspect}")
    end
  end

  include Tree

  # @param context parsing context (see Parser.context)
  def initialize(context)
    @context = context
    @re = OpenStruct.new(make_regexps)
  end

  # The parser is assembled from mixins; each file is loaded right
  # before its module is included, and that order is kept as is.
  require_relative 'parser/inline'
  include Parser::Inline

  require_relative 'parser/paragraphs'
  include Parser::Paragraphs

  private

  require_relative 'parser/util'
  include Parser::Util
end
|
69
|
+
end
|
70
|
+
|
71
|
+
require_relative 'parser/context'
|