infoboxer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.dokaz +1 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/Parsing.md +33 -0
- data/README.md +115 -0
- data/examples/output/.gitkeep +0 -0
- data/examples/pages/argentina.wiki +808 -0
- data/examples/to_text.rb +8 -0
- data/examples/tree.rb +8 -0
- data/infoboxer.gemspec +43 -0
- data/lib/infoboxer.rb +196 -0
- data/lib/infoboxer/core_ext.rb +10 -0
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
- data/lib/infoboxer/media_wiki.rb +162 -0
- data/lib/infoboxer/media_wiki/page.rb +38 -0
- data/lib/infoboxer/media_wiki/traits.rb +60 -0
- data/lib/infoboxer/navigation.rb +84 -0
- data/lib/infoboxer/navigation/lookup.rb +216 -0
- data/lib/infoboxer/navigation/sections.rb +179 -0
- data/lib/infoboxer/navigation/selector.rb +59 -0
- data/lib/infoboxer/navigation/shortcuts.rb +165 -0
- data/lib/infoboxer/parser.rb +71 -0
- data/lib/infoboxer/parser/context.rb +165 -0
- data/lib/infoboxer/parser/html.rb +58 -0
- data/lib/infoboxer/parser/image.rb +59 -0
- data/lib/infoboxer/parser/inline.rb +142 -0
- data/lib/infoboxer/parser/paragraphs.rb +66 -0
- data/lib/infoboxer/parser/table.rb +132 -0
- data/lib/infoboxer/parser/template.rb +47 -0
- data/lib/infoboxer/parser/util.rb +73 -0
- data/lib/infoboxer/templates.rb +10 -0
- data/lib/infoboxer/templates/base.rb +82 -0
- data/lib/infoboxer/templates/set.rb +72 -0
- data/lib/infoboxer/tree.rb +70 -0
- data/lib/infoboxer/tree/compound.rb +81 -0
- data/lib/infoboxer/tree/document.rb +11 -0
- data/lib/infoboxer/tree/html.rb +76 -0
- data/lib/infoboxer/tree/image.rb +53 -0
- data/lib/infoboxer/tree/inline.rb +39 -0
- data/lib/infoboxer/tree/list.rb +160 -0
- data/lib/infoboxer/tree/node.rb +181 -0
- data/lib/infoboxer/tree/nodes.rb +185 -0
- data/lib/infoboxer/tree/paragraphs.rb +122 -0
- data/lib/infoboxer/tree/ref.rb +34 -0
- data/lib/infoboxer/tree/table.rb +89 -0
- data/lib/infoboxer/tree/template.rb +82 -0
- data/lib/infoboxer/tree/text.rb +60 -0
- data/lib/infoboxer/tree/wikilink.rb +83 -0
- data/lib/infoboxer/version.rb +4 -0
- data/profile/out/.gitkeep +0 -0
- data/profile/pages/argentina.txt +808 -0
- data/profile/pages/canada.wiki +544 -0
- data/profile/pages/ukraine.wiki +1006 -0
- data/profile/pages/usa.wiki +843 -0
- data/regression/pages/canada.wiki +544 -0
- data/regression/pages/chiang_mai.wiki +2615 -0
- data/regression/pages/south_america.wiki +640 -0
- data/regression/pages/ukraine.wiki +1006 -0
- data/regression/pages/usa.wiki +843 -0
- metadata +272 -0
@@ -0,0 +1,179 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Navigation
|
4
|
+
# `Sections` module provides a logical view of the document structure.
|
5
|
+
#
|
6
|
+
# From this module's point of view, each {Tree::Document Document} is a
|
7
|
+
# {Sections::Container Sections::Container}, which consists of
|
8
|
+
# {Sections::Container#intro} (before first heading) and a set of
|
9
|
+
# nested {Sections::Container#sections}.
|
10
|
+
#
|
11
|
+
# Each document node, in turn, provides method {Sections::Node#in_sections},
|
12
|
+
# allowing you to receive list of sections, which contains current
|
13
|
+
# node.
|
14
|
+
#
|
15
|
+
# **NB**: Sections are "virtual" nodes, they are not, in fact, in
|
16
|
+
# documents tree. So, you can be surprised with:
|
17
|
+
#
|
18
|
+
# ```ruby
|
19
|
+
# document.sections # => list of Section instances
|
20
|
+
# document.lookup(:Section) # => []
|
21
|
+
#
|
22
|
+
# paragraph.in_sections # => list of sections
|
23
|
+
# paragraph.
|
24
|
+
# lookup_parents(:Section) # => []
|
25
|
+
# ```
|
26
|
+
module Sections
|
27
|
+
# This module is included in {Tree::Document Document}, allowing
|
28
|
+
# you to navigate through document's logical sections (and also
|
29
|
+
# included in each {Sections::Section} instance, allowing to navigate
|
30
|
+
# recursively).
|
31
|
+
#
|
32
|
+
# See also {Sections parent module} docs.
|
33
|
+
module Container
  # Paragraph-level nodes of this container located before its first
  # heading.
  #
  # @return {Tree::Nodes}
  def intro
    leading = children.take_while { |node| !node.is_a?(Tree::Heading) }
    leading.select { |node| node.is_a?(Tree::BaseParagraph) }
  end

  # Sections nested directly inside the current container, optionally
  # narrowed down by heading text.
  #
  # Usage examples:
  #
  # ```ruby
  # document.sections                 # all top-level sections
  # document.sections('Culture')      # only "Culture" section
  # document.sections(/^List of/)     # all sections with heading matching pattern
  #
  # document.
  #   sections('Culture').            # long way to receive a nested section
  #     sections('Music')             # (Culture / Music)
  #
  # document.
  #   sections('Culture', 'Music')    # the same as above
  #
  # document.
  #   sections('Culture' => 'Music')  # pretty-looking version for 2 levels of nesting
  # ```
  #
  # Each name is compared to the heading text with `===`, so both strings
  # and regexps work.
  #
  # @raise [ArgumentError] when a hash with more than one pair is passed
  # @return {Tree::Nodes<Section>}
  def sections(*names)
    @sections ||= make_sections

    # `'A' => 'B'` is sugar for two nesting levels: expand it to 'A', 'B'.
    if names.first.is_a?(Hash)
      pair = names.shift
      raise(ArgumentError, "Undefined behavior with #{pair}") unless pair.count == 1
      names.unshift(pair.keys.first, pair.values.first)
    end

    return @sections if names.empty?

    title, *nested = names
    found = @sections.select { |section| title === section.heading.text_ }
    nested.empty? ? found : found.sections(*nested)
  end

  private

  # Builds the list of sections from the container's children: the level
  # of the first heading found defines what counts as a section here.
  def make_sections
    result = Tree::Nodes[]
    return result if headings.empty?

    top_level = headings.first.level

    # Split children into alternating runs of "headings of the top level"
    # and "everything else"; whatever precedes the first heading run is
    # skipped (it is available through #intro).
    runs = children.chunk { |node| node.matches?(Tree::Heading, level: top_level) }
    runs.drop_while { |heading_run, _| !heading_run }.each do |heading_run, group|
      if heading_run
        group.each { |heading| result << Section.new(heading) }
      else
        # Non-heading run belongs to the most recently opened section.
        result.last.push_children(*group)
      end
    end

    result
  end
end
|
106
|
+
|
107
|
+
# Part of {Sections} navigation, allowing each node to know exact
|
108
|
+
# list of sections it is contained in.
|
109
|
+
#
|
110
|
+
# See also {Sections parent module} documentation.
|
111
|
+
module Node
  # List of sections the current node is contained in (bottom-to-top:
  # smallest section first).
  #
  # Sections are rebuilt on the fly: the closest preceding heading is
  # found, a {Section} is created from it and the siblings that follow,
  # and then the same is repeated for that heading to collect the outer
  # sections.
  #
  # @return {Tree::Nodes<Section>} empty when no heading precedes the node
  def in_sections
    # Work with the node whose parent is the document.
    # NOTE(review): `lookup_parents[-2]` presumably is the ancestor right
    # below the document (parents listed bottom-to-top) -- confirm.
    main_node = parent.is_a?(Tree::Document) ? self : lookup_parents[-2]

    heading =
      if main_node.is_a?(Tree::Heading)
        # For a heading, look for the closest preceding heading exactly one
        # level up rather than any preceding heading.
        main_node.lookup_prev_siblings(Tree::Heading, level: main_node.level - 1).last
      else
        main_node.lookup_prev_siblings(Tree::Heading).last
      end
    return Tree::Nodes[] unless heading

    # Section body runs up to the next heading of the same or a higher
    # rank. Deeper headings (greater level number) stay inside the body,
    # which matches how Container#make_sections nests them.
    body = heading.next_siblings.take_while do |node|
      !node.is_a?(Tree::Heading) || node.level > heading.level
    end

    Tree::Nodes[Section.new(heading, body), *heading.in_sections]
  end
end
|
133
|
+
|
134
|
+
# Part of {Sections} navigation, allowing chains of section search.
|
135
|
+
#
|
136
|
+
# See {Sections parent module} documentation.
|
137
|
+
module Nodes
  # @!method sections(*names)
  # @!method in_sections

  # Both methods are generated: each one calls the same-named method on
  # every node of the list and hands the collected results to
  # `make_nodes`.
  [:sections, :in_sections].each do |method_name|
    define_method(method_name) do |*args|
      per_node = map { |node| node.send(method_name, *args) }
      make_nodes(per_node)
    end
  end
end
|
147
|
+
|
148
|
+
# Virtual node, representing logical section of the document.
|
149
|
+
# Is not, in fact, in the tree.
|
150
|
+
#
|
151
|
+
# See {Sections parent module} documentation for details.
|
152
|
+
class Section < Tree::Compound
  include Container

  # Section's heading.
  #
  # @return {Tree::Heading}
  attr_reader :heading

  # @param heading heading node the section starts with
  # @param children nodes forming the section body
  def initialize(heading, children = Tree::Nodes[])
    # Deliberately no `super`: we don't want to rewrite the children's
    # parent.
    @heading = heading
    @children = Tree::Nodes[*children]
  end

  # Appends nodes to the section body; again, without rewriting their
  # parent.
  def push_children(*nodes)
    nodes.each { |node| @children << node }
  end

  # A section is never reported as empty.
  def empty?
    false
  end
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Navigation
|
4
|
+
module Lookup
|
5
|
+
# Encapsulates storage of selectors, used in {Lookup::Node node lookup}.
|
6
|
+
#
|
7
|
+
# See {Lookup::Node Lookup::Node} for detailed explanation of available selectors.
|
8
|
+
class Selector
  include ProcMe

  # Accepts any number of checks plus an optional block; every check must
  # pass for a node to match (see #matches?).
  #
  # Symbols naming a constant defined in `Tree` are replaced with that
  # constant. Hash checks are stored without their `nil`-valued pairs; the
  # hashes passed by the caller are left untouched (copies are stored), so
  # frozen or reused hashes are safe to pass.
  def initialize(*arg, &block)
    @arg = [arg, block].flatten.compact.map do |a|
      a = sym_to_class(a)
      a.is_a?(Hash) ? a.reject { |_, v| v.nil? } : a
    end
  end

  # Normalized list of checks.
  attr_reader :arg

  # Selectors are equal when they are of the same class and hold equal
  # checks.
  def ==(other)
    self.class == other.class && arg == other.arg
  end

  def inspect
    "#<Selector(#{@arg.map(&:to_s).join(', ')})>"
  end

  # @return true when the node satisfies every stored check
  def matches?(node)
    @arg.all? { |a| arg_matches?(a, node) }
  end

  private

  # Converts a symbol like `:Heading` into the `Tree` constant of the same
  # name, when one is defined; anything else is returned unchanged.
  def sym_to_class(a)
    # \A..\z (not ^..$): the whole symbol must look like a constant name,
    # otherwise a multi-line symbol would reach const_defined? and raise
    # NameError.
    if a.is_a?(Symbol) && a =~ /\A[A-Z][a-zA-Z]+\z/ && Tree.const_defined?(a)
      Tree.const_get(a)
    else
      a
    end
  end

  # Applies a single check to the node:
  #
  # * Proc: called with the node;
  # * Hash: for every pair, the node should respond to the key and the
  #   value should match (`===`) the result of calling it;
  # * Symbol: the node should respond to it and return a truthy value;
  # * anything else: compared with the node via `===`.
  def arg_matches?(check, node)
    case check
    when Proc
      check.call(node)
    when Hash
      check.all? { |attr, value|
        node.respond_to?(attr) && value === node.send(attr)
      }
    when Symbol
      node.respond_to?(check) && node.send(check)
    else
      check === node
    end
  end
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
module Navigation
|
4
|
+
# See {Shortcuts::Node Shortcuts::Node} for everything!
|
5
|
+
module Shortcuts
|
6
|
+
# `Shortcuts::Node` module provides some convenience methods for
|
7
|
+
# most used lookups. It's not rocket science (as you can see
|
8
|
+
# from methods code), yet should make your code cleaner and
|
9
|
+
# more readable.
|
10
|
+
#
|
11
|
+
# **NB**: as usual, {Tree::Nodes} class have synonyms for all of
|
12
|
+
# those methods, so you can call them fearlessly on any results of
|
13
|
+
# node lookup.
|
14
|
+
#
|
15
|
+
module Node
|
16
|
+
# Returns all wikilinks inside current node.
|
17
|
+
#
|
18
|
+
# @param namespace from which namespace links do you want. It's
|
19
|
+
# `''` (main namespace only) by default, if you really want all
|
20
|
+
# wikilinks on the page, including categories, interwikies and
|
21
|
+
# stuff, use `wikilinks(nil)`
|
22
|
+
# @return {Tree::Nodes}
|
23
|
+
def wikilinks(namespace = '')
|
24
|
+
lookup(Tree::Wikilink, namespace: namespace)
|
25
|
+
end
|
26
|
+
|
27
|
+
# Returns all headings inside current node.
|
28
|
+
#
|
29
|
+
# @param level headings level to return.
|
30
|
+
# @return {Tree::Nodes}
|
31
|
+
def headings(level = nil)
|
32
|
+
lookup(Tree::Heading, level: level)
|
33
|
+
end
|
34
|
+
|
35
|
+
# Returns all paragraph-level nodes (list items, plain paragraphs,
|
36
|
+
# headings and so on) inside current node.
|
37
|
+
#
|
38
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
39
|
+
# @return {Tree::Nodes}
|
40
|
+
def paragraphs(*selectors, &block)
|
41
|
+
lookup(Tree::BaseParagraph, *selectors, &block)
|
42
|
+
end
|
43
|
+
|
44
|
+
# Returns all external links inside current node.
|
45
|
+
#
|
46
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
47
|
+
# @return {Tree::Nodes}
|
48
|
+
def external_links(*selectors, &block)
|
49
|
+
lookup(Tree::ExternalLink, *selectors, &block)
|
50
|
+
end
|
51
|
+
|
52
|
+
# Returns all images (media) inside current node.
|
53
|
+
#
|
54
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
55
|
+
# @return {Tree::Nodes}
|
56
|
+
def images(*selectors, &block)
|
57
|
+
lookup(Tree::Image, *selectors, &block)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns all templates inside current node.
|
61
|
+
#
|
62
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
63
|
+
# @return {Tree::Nodes}
|
64
|
+
def templates(*selectors, &block)
|
65
|
+
lookup(Tree::Template, *selectors, &block)
|
66
|
+
end
|
67
|
+
|
68
|
+
# Returns all tables inside current node.
|
69
|
+
#
|
70
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
71
|
+
# @return {Tree::Nodes}
|
72
|
+
def tables(*selectors, &block)
|
73
|
+
lookup(Tree::Table, *selectors, &block)
|
74
|
+
end
|
75
|
+
|
76
|
+
# Returns all lists (ordered/unordered/definition) inside current node.
|
77
|
+
#
|
78
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
79
|
+
# @return {Tree::Nodes}
|
80
|
+
def lists(*selectors, &block)
|
81
|
+
lookup(Tree::List, *selectors, &block)
|
82
|
+
end
|
83
|
+
|
84
|
+
# Returns true, if current node is **inside** bold.
|
85
|
+
def bold?
|
86
|
+
has_parent?(Tree::Bold)
|
87
|
+
end
|
88
|
+
|
89
|
+
# Returns true, if current node is **inside** italic.
|
90
|
+
def italic?
|
91
|
+
has_parent?(Tree::Italic)
|
92
|
+
end
|
93
|
+
|
94
|
+
# Returns true, if current node is **inside** heading.
|
95
|
+
#
|
96
|
+
# @param level optional concrete level to check
|
97
|
+
def heading?(level = nil)
|
98
|
+
has_parent?(Tree::Heading, level: level)
|
99
|
+
end
|
100
|
+
|
101
|
+
# Returns all infoboxes inside current node.
|
102
|
+
#
|
103
|
+
# Definition of what considered to be infobox depends on templates
|
104
|
+
# set used when parsing the page.
|
105
|
+
#
|
106
|
+
# @param selectors node selectors, as described at {Lookup::Node}
|
107
|
+
# @return {Tree::Nodes}
|
108
|
+
def infoboxes(*selectors, &block)
|
109
|
+
lookup(Tree::Template, :infobox?, *selectors, &block)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Returns all wikilinks in "categories namespace".
|
113
|
+
#
|
114
|
+
# **NB**: depending on your MediaWiki settings, name of categories
|
115
|
+
# namespace may vary. When you are using {MediaWiki#get}, Infoboxer
|
116
|
+
# tries to handle this transparently (by examining used wiki for
|
117
|
+
# category names), yet bad things may happen here.
|
118
|
+
#
|
119
|
+
# @return {Tree::Nodes}
|
120
|
+
def categories
|
121
|
+
lookup(Tree::Wikilink, namespace: /^#{ensure_traits.category_prefix.join('|')}$/)
|
122
|
+
end
|
123
|
+
|
124
|
+
# As users accustomed to have only one infobox on a page
|
125
|
+
alias_method :infobox, :infoboxes
|
126
|
+
|
127
|
+
private
|
128
|
+
|
129
|
+
def ensure_traits
|
130
|
+
ensure_page.traits or fail("No site traits found")
|
131
|
+
end
|
132
|
+
|
133
|
+
def ensure_page
|
134
|
+
(is_a?(MediaWiki::Page) ? self : lookup_parents(MediaWiki::Page).first) or
|
135
|
+
fail("Node is not inside Page, maybe parsed from text?")
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
# Companion module of {Shortcuts::Node Shortcuts::Node}, defining
|
140
|
+
# all the same methods for {Tree::Nodes} so you can use them
|
141
|
+
# uniformly on single node or list. See {Shortcuts::Node there} for
|
142
|
+
# details.
|
143
|
+
module Nodes
  # @!method wikilinks(namespace = '')
  # @!method headings(level = nil)
  # @!method paragraphs(*selectors, &block)
  # @!method external_links(*selectors, &block)
  # @!method images(*selectors, &block)
  # @!method templates(*selectors, &block)
  # @!method tables(*selectors, &block)
  # @!method lists(*selectors, &block)
  # @!method infoboxes(*selectors, &block)
  # @!method categories

  # Every shortcut is generated the same way: call the same-named method
  # on each node of the list and pass the collected results to
  # `make_nodes`. The block, when given, is forwarded to each node as
  # well, so block selectors work the same as on a single node.
  [:wikilinks, :headings, :paragraphs, :external_links, :images,
   :templates, :tables, :lists, :infoboxes, :infobox, :categories].
    each do |m|
      define_method(m) { |*args, &block|
        make_nodes map { |n| n.send(m, *args, &block) }
      }
    end
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'ostruct'
|
3
|
+
require 'procme'
|
4
|
+
|
5
|
+
module Infoboxer
|
6
|
+
class Parser
  # Error type reserved for parsing failures.
  class ParsingError < RuntimeError
  end

  # Parses the text with the parser's `inline` method.
  #
  # @param text source text
  # @param traits site traits: `nil`, a Hash or a `MediaWiki::Traits`
  def self.inline(text, traits = nil)
    parser = new(context(text, traits))
    parser.inline
  end

  # Parses the text with the parser's `paragraphs` method.
  def self.paragraphs(text, traits = nil)
    parser = new(context(text, traits))
    parser.paragraphs
  end

  # First element of what {paragraphs} returns for the text.
  def self.paragraph(text, traits = nil)
    paragraphs(text, traits).first
  end

  # Wraps the result of {paragraphs} into a `Tree::Document`.
  def self.document(text, traits = nil)
    parsed = paragraphs(text, traits)
    Tree::Document.new(parsed)
  end

  # Parses the text with the parser's `long_inline` method.
  def self.fragment(text, traits = nil)
    parser = new(context(text, traits))
    parser.long_inline
  end

  # Builds the parsing context for the text and the coerced traits.
  def self.context(text, traits)
    Context.new(text, coerce_traits(traits))
  end

  # Converts the `traits` argument to a `MediaWiki::Traits`: `nil` gives
  # the default traits, a Hash is passed to `MediaWiki::Traits.new`, a
  # ready traits object is returned as is, anything else is rejected.
  def self.coerce_traits(traits)
    return MediaWiki::Traits.default if traits.nil?
    return MediaWiki::Traits.new(traits) if traits.is_a?(Hash)
    return traits if traits.is_a?(MediaWiki::Traits)

    raise ArgumentError, "Can't coerce site traits from #{traits.inspect}"
  end

  private_class_method :context, :coerce_traits

  include Tree

  # @param context parsing context, as built by the class-level helpers
  def initialize(context)
    @context = context
    # NOTE(review): `make_regexps` is not defined here; presumably it
    # comes from one of the modules included below -- confirm.
    @re = OpenStruct.new(make_regexps)
  end

  # Parser parts live in separate files; each is loaded right before the
  # module it defines is included.
  require_relative 'parser/inline'
  include Parser::Inline

  require_relative 'parser/paragraphs'
  include Parser::Paragraphs

  private

  require_relative 'parser/util'
  include Parser::Util
end
|
69
|
+
end
|
70
|
+
|
71
|
+
require_relative 'parser/context'
|