infoboxer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,179 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Navigation
4
+ # `Sections` module provides logical view on document structure.
5
+ #
6
+ # From this module's point of view, each {Tree::Document Document} is a
7
+ # {Sections::Container Sections::Container}, which consists of
8
+ # {Sections::Container#intro} (before first heading) and a set of
9
+ # nested {Sections::Container#sections}.
10
+ #
11
+ # Each document node, in turn, provides method {Sections::Node#in_sections},
12
+ # allowing you to receive list of sections, which contains current
13
+ # node.
14
+ #
15
+ # **NB**: Sections are "virtual" nodes, they are not, in fact, in
16
+ # document's tree. So, you may be surprised by:
17
+ #
18
+ # ```ruby
19
+ # document.sections # => list of Section instances
20
+ # document.lookup(:Section) # => []
21
+ #
22
+ # paragraph.in_sections # => list of sections
23
+ # paragraph.
24
+ # lookup_parents(:Section) # => []
25
+ # ```
26
+ module Sections
27
+ # This module is included in {Tree::Document Document}, allowing
28
+ # you to navigate through document's logical sections (and also
29
+ # included in each {Sections::Section} instance, allowing to navigate
30
+ # recursively).
31
+ #
32
+ # See also {Sections parent module} docs.
33
+ module Container
34
+ # All container's paragraph-level nodes before first heading.
35
+ #
36
+ # @return {Tree::Nodes}
37
+ def intro
38
+ children.
39
+ take_while{|n| !n.is_a?(Tree::Heading)}.
40
+ select{|n| n.is_a?(Tree::BaseParagraph)}
41
+ end
42
+
43
+ # List of sections inside current container.
44
+ #
45
+ # Examples of usage:
46
+ #
47
+ # ```ruby
48
+ # document.sections # all top-level sections
49
+ # document.sections('Culture') # only "Culture" section
50
+ # document.sections(/^List of/) # all sections with heading matching pattern
51
+ #
52
+ # document.
53
+ # sections('Culture'). # long way of receiving nested section
54
+ # sections('Music') # (Culture / Music)
55
+ #
56
+ # document.
57
+ # sections('Culture', 'Music') # the same as above
58
+ #
59
+ # document.
60
+ # sections('Culture' => 'Music') # pretty-looking version for 2 levels of nesting
61
+ # ```
62
+ #
63
+ # @return {Tree::Nodes<Section>}
64
+ def sections(*names)
65
+ @sections ||= make_sections
66
+
67
+ if names.first.is_a?(Hash)
68
+ h = names.shift
69
+ h.count == 1 or fail(ArgumentError, "Undefined behavior with #{h}")
70
+ names.unshift(h.keys.first, h.values.first)
71
+ end
72
+
73
+ case names.count
74
+ when 0
75
+ @sections
76
+ when 1
77
+ @sections.select{|s| names.first === s.heading.text_}
78
+ else
79
+ @sections.select{|s| names.first === s.heading.text_}.sections(*names[1..-1])
80
+ end
81
+ end
82
+
83
+ private
84
+
85
+ def make_sections
86
+ res = Tree::Nodes[]
87
+ return res if headings.empty?
88
+ level = headings.first.level
89
+
90
+ children.
91
+ chunk{|n| n.matches?(Tree::Heading, level: level)}.
92
+ drop_while{|is_heading, nodes| !is_heading}.
93
+ each do |is_heading, nodes|
94
+ if is_heading
95
+ nodes.each do |node|
96
+ res << Section.new(node)
97
+ end
98
+ else
99
+ res.last.push_children(*nodes)
100
+ end
101
+ end
102
+
103
+ res
104
+ end
105
+ end
106
+
107
+ # Part of {Sections} navigation, allowing each node to know exact
108
+ # list of sections it is contained in.
109
+ #
110
+ # See also {Sections parent module} documentation.
111
+ module Node
112
+ # List of sections current node is contained in (bottom-to-top:
113
+ # smallest section first).
114
+ #
115
+ # @return {Tree::Nodes<Section>}
116
+ def in_sections
117
+ main_node = parent.is_a?(Tree::Document) ? self : lookup_parents[-2]
118
+
119
+ heading = if main_node.is_a?(Tree::Heading)
120
+ main_node.lookup_prev_siblings(Tree::Heading, level: main_node.level - 1).last
121
+ else
122
+ main_node.lookup_prev_siblings(Tree::Heading).last
123
+ end
124
+ return Tree::Nodes[] unless heading
125
+
126
+ section = Section.new(heading,
127
+ heading.next_siblings.
128
+ take_while{|n| !n.is_a?(Tree::Heading) || n.level < heading.level}
129
+ )
130
+ Tree::Nodes[section, *heading.in_sections]
131
+ end
132
+ end
133
+
134
+ # Part of {Sections} navigation, allowing chains of section search.
135
+ #
136
+ # See {Sections parent module} documentation.
137
+ module Nodes
138
+ # @!method sections(*names)
139
+ # @!method in_sections
140
+
141
+ [:sections, :in_sections].each do |sym|
142
+ define_method(sym){|*args|
143
+ make_nodes map{|n| n.send(sym, *args)}
144
+ }
145
+ end
146
+ end
147
+
148
+ # Virtual node, representing logical section of the document.
149
+ # Is not, in fact, in the tree.
150
+ #
151
+ # See {Sections parent module} documentation for details.
152
+ class Section < Tree::Compound
153
+ def initialize(heading, children = Tree::Nodes[])
154
+ # no super: we don't want to rewrite children's parent
155
+ @children = Tree::Nodes[*children]
156
+ @heading = heading
157
+ end
158
+
159
+ # Section's heading.
160
+ #
161
+ # @return {Tree::Heading}
162
+ attr_reader :heading
163
+
164
+ # no rewriting of parent, again
165
+ def push_children(*nodes)
166
+ nodes.each do |n|
167
+ @children << n
168
+ end
169
+ end
170
+
171
+ def empty?
172
+ false
173
+ end
174
+
175
+ include Container
176
+ end
177
+ end
178
+ end
179
+ end
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Navigation
4
+ module Lookup
5
+ # Encapsulates storage of selectors, used in {Lookup::Node node lookup}.
6
+ #
7
+ # See {Lookup::Node Lookup::Node} for detailed explanation of available selectors.
8
+ class Selector
9
+ include ProcMe
10
+
11
+ def initialize(*arg, &block)
12
+ @arg = [arg, block].flatten.compact.map(&method(:sym_to_class))
13
+ @arg.each do |a|
14
+ a.reject!{|k, v| v.nil?} if a.is_a?(Hash)
15
+ end
16
+ end
17
+
18
+ attr_reader :arg
19
+
20
+ def ==(other)
21
+ self.class == other.class && arg == other.arg
22
+ end
23
+
24
+ def inspect
25
+ "#<Selector(#{@arg.map(&:to_s).join(', ')})>"
26
+ end
27
+
28
+ def matches?(node)
29
+ @arg.all?{|a| arg_matches?(a, node)}
30
+ end
31
+
32
+ private
33
+
34
+ def sym_to_class(a)
35
+ if a.is_a?(Symbol) && a =~ /^[A-Z][a-zA-Z]+$/ && Tree.const_defined?(a)
36
+ Tree.const_get(a)
37
+ else
38
+ a
39
+ end
40
+ end
41
+
42
+ def arg_matches?(check, node)
43
+ case check
44
+ when Proc
45
+ check.call(node)
46
+ when Hash
47
+ check.all?{|attr, value|
48
+ node.respond_to?(attr) && value === node.send(attr)
49
+ }
50
+ when Symbol
51
+ node.respond_to?(check) && node.send(check)
52
+ else
53
+ check === node
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,165 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Navigation
4
+ # See {Shortcuts::Node Shortcuts::Node} for everything!
5
+ module Shortcuts
6
+ # `Shortcuts::Node` module provides some convenience methods for
7
+ # most used lookups. It's not rocket science (as you can see
8
+ # from methods code), yet should make your code cleaner and
9
+ # more readable.
10
+ #
11
+ # **NB**: as usual, {Tree::Nodes} class has synonyms for all of
12
+ # those methods, so you can call them fearlessly on any results of
13
+ # node lookup.
14
+ #
15
+ module Node
16
+ # Returns all wikilinks inside current node.
17
+ #
18
+ # @param namespace from which namespace links do you want. It's
19
+ # `''` (main namespace only) by default, if you really want all
20
+ # wikilinks on the page, including categories, interwikis and
21
+ # stuff, use `wikilinks(nil)`
22
+ # @return {Tree::Nodes}
23
+ def wikilinks(namespace = '')
24
+ lookup(Tree::Wikilink, namespace: namespace)
25
+ end
26
+
27
+ # Returns all headings inside current node.
28
+ #
29
+ # @param level headings level to return.
30
+ # @return {Tree::Nodes}
31
+ def headings(level = nil)
32
+ lookup(Tree::Heading, level: level)
33
+ end
34
+
35
+ # Returns all paragraph-level nodes (list items, plain paragraphs,
36
+ # headings and so on) inside current node.
37
+ #
38
+ # @param selectors node selectors, as described at {Lookup::Node}
39
+ # @return {Tree::Nodes}
40
+ def paragraphs(*selectors, &block)
41
+ lookup(Tree::BaseParagraph, *selectors, &block)
42
+ end
43
+
44
+ # Returns all external links inside current node.
45
+ #
46
+ # @param selectors node selectors, as described at {Lookup::Node}
47
+ # @return {Tree::Nodes}
48
+ def external_links(*selectors, &block)
49
+ lookup(Tree::ExternalLink, *selectors, &block)
50
+ end
51
+
52
+ # Returns all images (media) inside current node.
53
+ #
54
+ # @param selectors node selectors, as described at {Lookup::Node}
55
+ # @return {Tree::Nodes}
56
+ def images(*selectors, &block)
57
+ lookup(Tree::Image, *selectors, &block)
58
+ end
59
+
60
+ # Returns all templates inside current node.
61
+ #
62
+ # @param selectors node selectors, as described at {Lookup::Node}
63
+ # @return {Tree::Nodes}
64
+ def templates(*selectors, &block)
65
+ lookup(Tree::Template, *selectors, &block)
66
+ end
67
+
68
+ # Returns all tables inside current node.
69
+ #
70
+ # @param selectors node selectors, as described at {Lookup::Node}
71
+ # @return {Tree::Nodes}
72
+ def tables(*selectors, &block)
73
+ lookup(Tree::Table, *selectors, &block)
74
+ end
75
+
76
+ # Returns all lists (ordered/unordered/definition) inside current node.
77
+ #
78
+ # @param selectors node selectors, as described at {Lookup::Node}
79
+ # @return {Tree::Nodes}
80
+ def lists(*selectors, &block)
81
+ lookup(Tree::List, *selectors, &block)
82
+ end
83
+
84
+ # Returns true, if current node is **inside** bold.
85
+ def bold?
86
+ has_parent?(Tree::Bold)
87
+ end
88
+
89
+ # Returns true, if current node is **inside** italic.
90
+ def italic?
91
+ has_parent?(Tree::Italic)
92
+ end
93
+
94
+ # Returns true, if current node is **inside** heading.
95
+ #
96
+ # @param level optional concrete level to check
97
+ def heading?(level = nil)
98
+ has_parent?(Tree::Heading, level: level)
99
+ end
100
+
101
+ # Returns all infoboxes inside current node.
102
+ #
103
+ # Definition of what is considered to be an infobox depends on the templates
104
+ # set used when parsing the page.
105
+ #
106
+ # @param selectors node selectors, as described at {Lookup::Node}
107
+ # @return {Tree::Nodes}
108
+ def infoboxes(*selectors, &block)
109
+ lookup(Tree::Template, :infobox?, *selectors, &block)
110
+ end
111
+
112
+ # Returns all wikilinks in "categories namespace".
113
+ #
114
+ # **NB**: depending on your MediaWiki settings, name of categories
115
+ # namespace may vary. When you are using {MediaWiki#get}, Infoboxer
116
+ # tries to handle this transparently (by examining used wiki for
117
+ # category names), yet bad things may happen here.
118
+ #
119
+ # @return {Tree::Nodes}
120
+ def categories
121
+ lookup(Tree::Wikilink, namespace: /^#{ensure_traits.category_prefix.join('|')}$/)
122
+ end
123
+
124
+ # As users are accustomed to having only one infobox on a page
125
+ alias_method :infobox, :infoboxes
126
+
127
+ private
128
+
129
+ def ensure_traits
130
+ ensure_page.traits or fail("No site traits found")
131
+ end
132
+
133
+ def ensure_page
134
+ (is_a?(MediaWiki::Page) ? self : lookup_parents(MediaWiki::Page).first) or
135
+ fail("Node is not inside Page, maybe parsed from text?")
136
+ end
137
+ end
138
+
139
+ # Companion module of {Shortcuts::Node Shortcuts::Node}, defining
140
+ # all the same methods for {Tree::Nodes} so you can use them
141
+ # uniformly on single node or list. See {Shortcuts::Node there} for
142
+ # details.
143
+ module Nodes
144
+ # @!method wikilinks(namespace = '')
145
+ # @!method headings(level = nil)
146
+ # @!method paragraphs(*selectors, &block)
147
+ # @!method external_links(*selectors, &block)
148
+ # @!method images(*selectors, &block)
149
+ # @!method templates(*selectors, &block)
150
+ # @!method tables(*selectors, &block)
151
+ # @!method lists(*selectors, &block)
152
+ # @!method infoboxes(*selectors, &block)
153
+ # @!method categories
154
+
155
+ [:wikilinks, :headings, :paragraphs, :external_links, :images,
156
+ :templates, :tables, :lists, :infoboxes, :infobox, :categories].
157
+ each do |m|
158
+ define_method(m){|*args|
159
+ make_nodes map{|n| n.send(m, *args)}
160
+ }
161
+ end
162
+ end
163
+ end
164
+ end
165
+ end
@@ -0,0 +1,71 @@
1
+ # encoding: utf-8
2
+ require 'ostruct'
3
+ require 'procme'
4
+
5
+ module Infoboxer
6
+ class Parser
7
+ class ParsingError < RuntimeError
8
+ end
9
+
10
+ class << self
11
+ def inline(text, traits = nil)
12
+ new(context(text, traits)).inline
13
+ end
14
+
15
+ def paragraphs(text, traits = nil)
16
+ new(context(text, traits)).paragraphs
17
+ end
18
+
19
+ def paragraph(text, traits = nil)
20
+ paragraphs(text, traits).first
21
+ end
22
+
23
+ def document(text, traits = nil)
24
+ Tree::Document.new(paragraphs(text, traits))
25
+ end
26
+
27
+ def fragment(text, traits = nil)
28
+ new(context(text, traits)).long_inline
29
+ end
30
+
31
+ private
32
+
33
+ def context(text, traits)
34
+ Context.new(text, coerce_traits(traits))
35
+ end
36
+
37
+ def coerce_traits(traits)
38
+ case traits
39
+ when nil
40
+ MediaWiki::Traits.default
41
+ when Hash
42
+ MediaWiki::Traits.new(traits)
43
+ when MediaWiki::Traits
44
+ traits
45
+ else
46
+ fail(ArgumentError, "Can't coerce site traits from #{traits.inspect}")
47
+ end
48
+ end
49
+ end
50
+
51
+ include Tree
52
+
53
+ def initialize(context)
54
+ @context = context
55
+ @re = OpenStruct.new(make_regexps)
56
+ end
57
+
58
+ require_relative 'parser/inline'
59
+ include Parser::Inline
60
+
61
+ require_relative 'parser/paragraphs'
62
+ include Parser::Paragraphs
63
+
64
+ private
65
+
66
+ require_relative 'parser/util'
67
+ include Parser::Util
68
+ end
69
+ end
70
+
71
+ require_relative 'parser/context'