infoboxer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,179 @@
# encoding: utf-8
module Infoboxer
  module Navigation
    # `Sections` module provides logical view on document structure.
    #
    # From this module's point of view, each {Tree::Document Document} is a
    # {Sections::Container Sections::Container}, which consists of
    # {Sections::Container#intro} (before first heading) and a set of
    # nested {Sections::Container#sections}.
    #
    # Each document node, in turn, provides method {Sections::Node#in_sections},
    # allowing you to receive list of sections, which contains current
    # node.
    #
    # **NB**: Sections are "virtual" nodes, they are not, in fact, in
    # documents tree. So, you can be surprised with:
    #
    # ```ruby
    # document.sections                 # => list of Section instances
    # document.lookup(:Section)         # => []
    #
    # paragraph.in_sections             # => list of sections
    # paragraph.
    #   lookup_parents(:Section)        # => []
    # ```
    module Sections
      # This module is included in {Tree::Document Document}, allowing
      # you to navigate through document's logical sections (and also
      # included in each {Sections::Section} instance, allowing to navigate
      # recursively).
      #
      # See also {Sections parent module} docs.
      module Container
        # All container's paragraph-level nodes before first heading.
        #
        # @return {Tree::Nodes}
        def intro
          children.
            take_while { |n| !n.is_a?(Tree::Heading) }.
            select { |n| n.is_a?(Tree::BaseParagraph) }
        end

        # List of sections inside current container.
        #
        # Examples of usage:
        #
        # ```ruby
        # document.sections                 # all top-level sections
        # document.sections('Culture')      # only "Culture" section
        # document.sections(/^List of/)     # all sections with heading matching pattern
        #
        # document.
        #   sections('Culture').            # long way of receiving nested section
        #     sections('Music')             # (Culture / Music)
        #
        # document.
        #   sections('Culture', 'Music')    # the same as above
        #
        # document.
        #   sections('Culture' => 'Music')  # pretty-looking version for 2 levels of nesting
        # ```
        #
        # @param names heading matchers (anything supporting `===` against
        #   heading text), one per nesting level; a single-pair Hash in the
        #   first position is expanded into two levels.
        # @raise [ArgumentError] if the leading Hash has more than one pair.
        # @return {Tree::Nodes<Section>}
        def sections(*names)
          @sections ||= make_sections

          first, *rest = names
          if first.is_a?(Hash)
            first.count == 1 or fail(ArgumentError, "Undefined behavior with #{first}")
            names = [first.keys.first, first.values.first, *rest]
          end

          return @sections if names.empty?

          found = @sections.select { |s| names.first === s.heading.text_ }
          names.count == 1 ? found : found.sections(*names[1..-1])
        end

        private

        # Builds the list of direct sections: every heading of the same level
        # as the first heading found starts a new section, and all nodes up to
        # the next such heading become that section's children.
        def make_sections
          res = Tree::Nodes[]
          return res if headings.empty?
          level = headings.first.level

          children.
            chunk { |n| n.matches?(Tree::Heading, level: level) }.
            drop_while { |is_heading, _nodes| !is_heading }. # skip intro
            each do |is_heading, nodes|
              if is_heading
                nodes.each { |node| res << Section.new(node) }
              else
                res.last.push_children(*nodes)
              end
            end

          res
        end
      end

      # Part of {Sections} navigation, allowing each node to know exact
      # list of sections it is contained in.
      #
      # See also {Sections parent module} documentation.
      module Node
        # List of sections current node is contained in (bottom-to-top:
        # smallest section first).
        #
        # @return {Tree::Nodes<Section>}
        def in_sections
          # NOTE(review): lookup_parents[-2] is presumably the ancestor that
          # sits directly under the document -- confirm against Lookup::Node.
          main_node = parent.is_a?(Tree::Document) ? self : lookup_parents[-2]

          heading =
            if main_node.is_a?(Tree::Heading)
              # a heading belongs to the section opened by the nearest
              # heading one level above it
              main_node.lookup_prev_siblings(Tree::Heading, level: main_node.level - 1).last
            else
              main_node.lookup_prev_siblings(Tree::Heading).last
            end
          return Tree::Nodes[] unless heading

          # Section body runs until the next heading of the same or an outer
          # level; deeper headings (larger level) are nested subsections and
          # stay inside the body.
          body = heading.next_siblings.
            take_while { |n| !n.is_a?(Tree::Heading) || n.level > heading.level }

          Tree::Nodes[Section.new(heading, body), *heading.in_sections]
        end
      end

      # Part of {Sections} navigation, allowing chains of section search.
      #
      # See {Sections parent module} documentation.
      module Nodes
        # @!method sections(*names)
        # @!method in_sections

        [:sections, :in_sections].each do |sym|
          define_method(sym) { |*args|
            make_nodes(map { |n| n.send(sym, *args) })
          }
        end
      end

      # Virtual node, representing logical section of the document.
      # Is not, in fact, in the tree.
      #
      # See {Sections parent module} documentation for details.
      class Section < Tree::Compound
        include Container

        # @param heading [Tree::Heading] heading that opens the section
        # @param children nodes belonging to the section
        def initialize(heading, children = Tree::Nodes[])
          # no super: we don't want to rewrite children's parent
          @children = Tree::Nodes[*children]
          @heading = heading
        end

        # Section's heading.
        #
        # @return {Tree::Heading}
        attr_reader :heading

        # Appends nodes to the section; no rewriting of parent, again.
        def push_children(*nodes)
          nodes.each do |n|
            @children << n
          end
        end

        # A section is never considered empty.
        def empty?
          false
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
# encoding: utf-8
module Infoboxer
  module Navigation
    module Lookup
      # Encapsulates storage of selectors, used in {Lookup::Node node lookup}.
      #
      # See {Lookup::Node Lookup::Node} for detailed explanation of available selectors.
      class Selector
        include ProcMe

        # @param arg list of checks: classes (or symbols naming classes from
        #   `Tree`), hashes of `attribute => expected value`, symbols naming
        #   predicate methods, or anything else supporting `===`.
        # @param block optional block, used as one more check.
        def initialize(*arg, &block)
          @arg = [arg, block].flatten.compact.map do |a|
            a = sym_to_class(a)
            # Drop nil-valued conditions without touching the caller's hash
            # (it may be shared or frozen).
            a.is_a?(Hash) ? a.reject { |_k, v| v.nil? } : a
          end
        end

        # Normalized list of checks.
        attr_reader :arg

        # Selectors are equal when they are of the same class and hold equal checks.
        def ==(other)
          self.class == other.class && arg == other.arg
        end

        def inspect
          "#<Selector(#{@arg.map(&:to_s).join(', ')})>"
        end

        # @return [Boolean] true when the node satisfies every stored check.
        def matches?(node)
          @arg.all? { |a| arg_matches?(a, node) }
        end

        private

        # Converts a symbol like `:Heading` into the `Tree::Heading` class when
        # such a constant exists; everything else is returned unchanged.
        def sym_to_class(a)
          # \A/\z (not ^/$): a multi-line symbol must not pass the name check
          # and then blow up inside const_defined?.
          if a.is_a?(Symbol) && a =~ /\A[A-Z][a-zA-Z]+\z/ && Tree.const_defined?(a)
            Tree.const_get(a)
          else
            a
          end
        end

        # Applies a single check to the node, depending on the check's type.
        def arg_matches?(check, node)
          case check
          when Proc
            check.call(node)
          when Hash
            check.all? { |attr, value|
              node.respond_to?(attr) && value === node.send(attr)
            }
          when Symbol
            node.respond_to?(check) && node.send(check)
          else
            check === node
          end
        end
      end
    end
  end
end
@@ -0,0 +1,165 @@
# encoding: utf-8
module Infoboxer
  module Navigation
    # See {Shortcuts::Node Shortcuts::Node} for everything!
    module Shortcuts
      # `Shortcuts::Node` module provides some convenience methods for
      # most used lookups. It's not a rocket science (as you can see
      # from methods code), yet should make your code cleaner and
      # more readable.
      #
      # **NB**: as usual, {Tree::Nodes} class has synonyms for all of
      # those methods, so you can call them fearlessly on any results of
      # node lookup.
      #
      module Node
        # Returns all wikilinks inside current node.
        #
        # @param namespace from which namespace links do you want. It's
        #   `''` (main namespace only) by default, if you really want all
        #   wikilinks on the page, including categories, interwikis and
        #   stuff, use `wikilinks(nil)`
        # @return {Tree::Nodes}
        def wikilinks(namespace = '')
          lookup(Tree::Wikilink, namespace: namespace)
        end

        # Returns all headings inside current node.
        #
        # @param level headings level to return.
        # @return {Tree::Nodes}
        def headings(level = nil)
          lookup(Tree::Heading, level: level)
        end

        # Returns all paragraph-level nodes (list items, plain paragraphs,
        # headings and so on) inside current node.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def paragraphs(*selectors, &block)
          lookup(Tree::BaseParagraph, *selectors, &block)
        end

        # Returns all external links inside current node.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def external_links(*selectors, &block)
          lookup(Tree::ExternalLink, *selectors, &block)
        end

        # Returns all images (media) inside current node.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def images(*selectors, &block)
          lookup(Tree::Image, *selectors, &block)
        end

        # Returns all templates inside current node.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def templates(*selectors, &block)
          lookup(Tree::Template, *selectors, &block)
        end

        # Returns all tables inside current node.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def tables(*selectors, &block)
          lookup(Tree::Table, *selectors, &block)
        end

        # Returns all lists (ordered/unordered/definition) inside current node.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def lists(*selectors, &block)
          lookup(Tree::List, *selectors, &block)
        end

        # Returns true, if current node is **inside** bold.
        def bold?
          has_parent?(Tree::Bold)
        end

        # Returns true, if current node is **inside** italic.
        def italic?
          has_parent?(Tree::Italic)
        end

        # Returns true, if current node is **inside** heading.
        #
        # @param level optional concrete level to check
        def heading?(level = nil)
          has_parent?(Tree::Heading, level: level)
        end

        # Returns all infoboxes inside current node.
        #
        # Definition of what is considered to be an infobox depends on
        # templates set used when parsing the page.
        #
        # @param selectors node selectors, as described at {Lookup::Node}
        # @return {Tree::Nodes}
        def infoboxes(*selectors, &block)
          lookup(Tree::Template, :infobox?, *selectors, &block)
        end

        # Returns all wikilinks in "categories namespace".
        #
        # **NB**: depending on your MediaWiki settings, name of categories
        # namespace may vary. When you are using {MediaWiki#get}, Infoboxer
        # tries to handle this transparently (by examining used wiki for
        # category names), yet bad things may happen here.
        #
        # @return {Tree::Nodes}
        def categories
          # Prefixes are grouped and escaped so that the anchors apply to the
          # whole alternation: a bare /^A|B$/ would also accept any namespace
          # merely starting with A or ending with B.
          prefixes = ensure_traits.category_prefix.
            map { |prefix| Regexp.escape(prefix.to_s) }.
            join('|')
          lookup(Tree::Wikilink, namespace: /\A(?:#{prefixes})\z/)
        end

        # As users accustomed to have only one infobox on a page
        alias_method :infobox, :infoboxes

        private

        # Site traits of the page this node belongs to; fails if the page has none.
        def ensure_traits
          ensure_page.traits || fail("No site traits found")
        end

        # The page itself, or the page this node is inside of; fails if the
        # node is not inside any page.
        def ensure_page
          (is_a?(MediaWiki::Page) ? self : lookup_parents(MediaWiki::Page).first) ||
            fail("Node is not inside Page, maybe parsed from text?")
        end
      end

      # Companion module of {Shortcuts::Node Shortcuts::Node}, defining
      # all the same methods for {Tree::Nodes} so you can use them
      # uniformly on single node or list. See {Shortcuts::Node there} for
      # details.
      module Nodes
        # @!method wikilinks(namespace = '')
        # @!method headings(level = nil)
        # @!method paragraphs(*selectors, &block)
        # @!method external_links(*selectors, &block)
        # @!method images(*selectors, &block)
        # @!method templates(*selectors, &block)
        # @!method tables(*selectors, &block)
        # @!method lists(*selectors, &block)
        # @!method infoboxes(*selectors, &block)
        # @!method categories

        [:wikilinks, :headings, :paragraphs, :external_links, :images,
         :templates, :tables, :lists, :infoboxes, :infobox, :categories].
          each do |m|
            # The block is forwarded too, so block selectors work on node
            # lists exactly as they do on a single node.
            define_method(m) { |*args, &block|
              make_nodes(map { |n| n.send(m, *args, &block) })
            }
          end
      end
    end
  end
end
@@ -0,0 +1,71 @@
# encoding: utf-8
require 'ostruct'
require 'procme'

module Infoboxer
  # Entry point for parsing wiki markup. The class-level helpers build a
  # parsing context from raw text plus site traits and delegate to the
  # instance-level parsing methods (which are presumably provided by the
  # Parser::Inline / Parser::Paragraphs modules included below).
  class Parser
    # Error type for parsing failures.
    class ParsingError < RuntimeError
    end

    class << self
      # Parses text with the parser's `inline` method.
      #
      # @param text source markup
      # @param traits nil, Hash or MediaWiki::Traits (see coerce_traits)
      def inline(text, traits = nil)
        parser = new(context(text, traits))
        parser.inline
      end

      # Parses text with the parser's `paragraphs` method.
      #
      # @param text source markup
      # @param traits nil, Hash or MediaWiki::Traits (see coerce_traits)
      def paragraphs(text, traits = nil)
        parser = new(context(text, traits))
        parser.paragraphs
      end

      # Same as {paragraphs}, but returns only the first result.
      def paragraph(text, traits = nil)
        parsed = paragraphs(text, traits)
        parsed.first
      end

      # Parses text into paragraphs and wraps them into a Tree::Document.
      def document(text, traits = nil)
        parsed = paragraphs(text, traits)
        Tree::Document.new(parsed)
      end

      # Parses text with the parser's `long_inline` method.
      def fragment(text, traits = nil)
        parser = new(context(text, traits))
        parser.long_inline
      end

      private

      # Builds parsing context for the text, normalizing traits first.
      def context(text, traits)
        site_traits = coerce_traits(traits)
        Context.new(text, site_traits)
      end

      # Normalizes the `traits` argument into a MediaWiki::Traits instance:
      # nil means default traits, a Hash is used to construct new traits,
      # a ready instance is passed through.
      #
      # Raises ArgumentError for any other kind of value.
      def coerce_traits(traits)
        return MediaWiki::Traits.default if traits.nil?
        return MediaWiki::Traits.new(traits) if traits.is_a?(Hash)
        return traits if traits.is_a?(MediaWiki::Traits)

        fail(ArgumentError, "Can't coerce site traits from #{traits.inspect}")
      end
    end

    include Tree

    # @param context parsing context (as built by the class-level helpers)
    def initialize(context)
      @context = context
      # make_regexps is not defined in this file; presumably it comes from
      # one of the modules included below.
      @re = OpenStruct.new(make_regexps)
    end

    # Parser parts are loaded here, after the class itself is opened;
    # the order of requires/includes is kept as is.
    require_relative 'parser/inline'
    include Parser::Inline

    require_relative 'parser/paragraphs'
    include Parser::Paragraphs

    private

    require_relative 'parser/util'
    include Parser::Util
  end
end

require_relative 'parser/context'