infoboxer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,185 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # List of nodes, which tries to be useful both as array, and as proxy
5
+ # to its contents.
6
+ #
7
+ # Many of Infoboxer's methods (especially {Navigation}'s) return
8
+ # `Nodes`, and in most cases you don't have to think about it. Same
9
+ # approach can be seen in jQuery or Nokogiri. You just do things
10
+ # like those:
11
+ #
12
+ # ```ruby
13
+ # document.sections. # => Nodes returned,
14
+ # select{|section| # you can treat them as array, but also...
15
+ # section.text.length > 1000 #
16
+ # }. #
17
+ # lookup(:Wikilink, text: /Chile/). # ...use Infoboxer's methods
18
+ # follow. # ...even to receive lists of other pages
19
+ # infoboxes. # ...and use methods on them
20
+ # fetch('leader_name1'). # ...including those which only some node types support
21
+ # map(&:text) # ...and still have full-functioning Array
22
+ # ```
23
+ #
24
class Nodes < Array
  # Maximum number of elements {#inspect} prints before abbreviating.
  # @private
  MAX_CHILDREN = 5

  # @!method select(&block)
  #   Just like Array#select, but returns Nodes

  # @!method reject(&block)
  #   Just like Array#reject, but returns Nodes

  # @!method sort_by(&block)
  #   Just like Array#sort_by, but returns Nodes

  # @!method flatten
  #   Just like Array#flatten, but returns Nodes

  # @!method compact
  #   Just like Array#compact, but returns Nodes

  # @!method -(other)
  #   Just like Array#-, but returns Nodes

  # Each of these Array methods is re-wrapped so the result is Nodes again.
  [:select, :reject, :sort_by, :flatten, :compact, :-].each do |sym|
    define_method(sym) do |*args, &block|
      Nodes[*super(*args, &block)]
    end
  end

  # Just like Array#first, but returns Nodes, if provided with `n` of elements.
  #
  # @param n [Integer, nil] how many leading elements to take
  # @return [Object, Nodes] single element without `n`, Nodes with it
  def first(n = nil)
    n.nil? ? super() : Nodes[*super(n)]
  end

  # Just like Array#last, but returns Nodes, if provided with `n` of elements.
  #
  # @param n [Integer, nil] how many trailing elements to take
  # @return [Object, Nodes] single element without `n`, Nodes with it
  def last(n = nil)
    n.nil? ? super() : Nodes[*super(n)]
  end

  # Just like Array#map, but returns Nodes, **if** all map results are
  # Node (or Nodes). Called without a block, returns the Enumerator
  # produced by Array#map, so chains like `map.with_index` work.
  #
  # @return [Nodes, Array, Enumerator]
  def map
    return super unless block_given?

    mapped = super
    return mapped unless mapped.all? { |item| item.is_a?(Node) || item.is_a?(Nodes) }

    Nodes[*mapped]
  end

  # @!method prev_siblings
  #   Previous siblings (flat list) of all nodes inside.

  # @!method next_siblings
  #   Next siblings (flat list) of all nodes inside.

  # @!method siblings
  #   Siblings (flat list) of all nodes inside.

  # @!method fetch
  #   Fetches by name(s) variables for all templates inside.
  #
  #   See {Templates::Base#fetch} for explanation.

  # Each of these is sent to every element; results are flattened into Nodes.
  [:prev_siblings, :next_siblings, :siblings, :fetch].each do |sym|
    define_method(sym) do |*args|
      make_nodes(map { |n| n.send(sym, *args) })
    end
  end

  # By list of variable names, fetches hashes of `{name => value}`
  # from all templates inside.
  #
  # See {Templates::Base#fetch_hash} for explanation.
  #
  # @return [Array<Hash>]
  def fetch_hashes(*args)
    map { |t| t.fetch_hash(*args) }
  end

  # Just join of all {Node#to_tree Node#to_tree} strings inside.
  def to_tree
    map(&:to_tree).join("\n")
  end

  # Bracketed list of element inspections; anything past the first
  # MAX_CHILDREN elements is summarized as "...N more nodes".
  def inspect
    body =
      if count > MAX_CHILDREN
        shown = self[0...MAX_CHILDREN].map(&:inspect).join(', ')
        "#{shown}, ...#{count - MAX_CHILDREN} more nodes"
      else
        map(&:inspect).join(', ')
      end
    "[#{body}]"
  end

  # Just join of all {Node#text Node#text}s inside.
  def text
    map(&:text).join
  end

  # Fetches pages by ALL wikilinks inside in ONE query to MediaWiki
  # API.
  #
  # **NB**: for now, if there are more than 50 wikilinks (limitation for
  # one request to API), Infoboxer **will not** try to do next page.
  # It will be fixed in next releases.
  #
  # @return [Nodes<MediaWiki::Page>] It is still `Nodes`, so you
  #   still can process them uniformly.
  # @raise [RuntimeError] if the first node has no {MediaWiki::Page}
  #   among its parents, or that page has no client
  def follow
    links = select { |n| n.respond_to?(:link) }.map(&:link)
    return Nodes[] if links.empty?

    page = first.lookup_parents(MediaWiki::Page).first
    fail('Not in a page from real source') unless page
    fail('MediaWiki client not set') unless page.client

    page.client.get(*links)
  end

  # Internal, used by {Parser}
  #
  # Arrays are appended element by element; a node the current last
  # element can merge with is merged into it; nil and empty nodes are
  # dropped; Strings are wrapped into {Text} before being appended.
  def <<(node)
    if node.kind_of?(Array)
      node.each { |n| self << n }
    elsif last && last.can_merge?(node)
      last.merge!(node)
    else
      return if !node || node.empty?
      node = Text.new(node) if node.is_a?(String)
      super(node)
    end
  end

  # Internal, used by {Parser}
  #
  # Returns a copy without trailing whitespace-only {Text} nodes, and
  # trims trailing whitespace of the remaining last {Text} node.
  # The copy is shallow, so that trim modifies the shared Text node.
  def strip
    res = dup
    # \A..\z anchor the WHOLE string: with ^..$ a Text containing one
    # blank line in the middle would be treated as blank and dropped.
    res.pop while res.last.is_a?(Text) && res.last.raw_text =~ /\A\s*\z/
    res.last.raw_text.sub!(/\s+\z/, '') if res.last.is_a?(Text)
    res
  end

  # Internal, used by {Parser}
  def flow_templates
    make_nodes(map { |n| n.is_a?(Paragraph) ? n.to_templates? : n })
  end

  private

  # Flattens (possibly nested) results into a single Nodes list.
  def make_nodes(arr)
    Nodes[*arr.flatten]
  end
end
184
+ end
185
+ end
@@ -0,0 +1,122 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # Base class for all "paragraph-level" nodes: {Paragraph}, {ListItem},
5
+ # {Heading}. It should be convenient to use it in {Navigation::Lookup::Node#_lookup Node#lookup}
6
+ # and similar methods like this:
7
+ #
8
+ # ```ruby
9
+ # page.lookup(:BaseParagraph) # => flat list of paragraph-levels
10
+ # ```
11
class BaseParagraph < Compound
  # Inherited text with surrounding whitespace stripped, followed by
  # exactly two newlines.
  def text
    "#{super.strip}\n\n"
  end
end
16
+
17
+ # @private
18
+ # Internal! Nothing to see here! Just YARD `@private` tag not working at class level
19
class EmptyParagraph < Node
  # Raw text this placeholder was created with.
  attr_reader :text

  # @param text [String] stored as-is and exposed through {#text}
  def initialize(text)
    @text = text
  end

  # Always true: this node should never be left in nodes flow.
  def empty?
    true
  end
end
31
+
32
+ # @private
33
+ # Internal! Nothing to see here! Just YARD `@private` tag not working at class level
34
module Mergeable
  # A node accepts another one only while it is still open and the other
  # node is of exactly the same class.
  def can_merge?(other)
    return false if closed?

    self.class == other.class
  end

  # Absorbs `other`: an EmptyParagraph just closes this node; anything
  # else contributes a splitter plus all of its children, and this node
  # takes over the other node's closed state.
  def merge!(other)
    return @closed = true if other.is_a?(EmptyParagraph)

    # Build the whole list first, then append piece by piece.
    incoming = [splitter] + other.children.to_a
    incoming.each { |piece| @children << piece }
    @closed = other.closed?
  end
end
50
+
51
+ # @private
52
+ # Internal! Nothing to see here! Just YARD `@private` tag not working at class level
53
class MergeableParagraph < BaseParagraph
  include Mergeable

  # Like {Mergeable#can_merge?}, but an open paragraph additionally
  # accepts an EmptyParagraph.
  def can_merge?(other)
    return false if closed?

    other.is_a?(EmptyParagraph) || self.class == other.class
  end
end
61
+
62
+ # Represents plain text paragraph.
63
class Paragraph < MergeableParagraph
  # Internal, used by {Parser} for merging: merged paragraph chunks
  # are joined by this Text node.
  def splitter
    Text.new(' ')
  end

  # Internal, used by {Parser}
  #
  # True when every child is either a Template or a Text node whose
  # raw text is blank after stripping.
  def templates_only?
    children.all? do |child|
      child.is_a?(Template) ||
        (child.is_a?(Text) && child.raw_text.strip.empty?)
    end
  end

  # Internal, used by {Parser}
  #
  # Children selected through the `filter(itself: Template)` helper.
  # NOTE(review): Table uses `fltr(...)` for the same purpose - confirm
  # both helper names are available.
  def to_templates
    children.select(&filter(itself: Template))
  end

  # Internal, used by {Parser}
  #
  # Despite the `?`, returns a value: the templates when the paragraph
  # consists of templates only, the paragraph itself otherwise.
  def to_templates?
    return self unless templates_only?

    to_templates
  end
end
84
+
85
+ # Represents horizontal ruler splitter. Rarely seen in modern wikis.
86
class HR < Node
  # Adds nothing of its own on top of Node.
end
88
+
89
+ # Represents heading.
90
+ #
91
+ # NB: min heading level in MediaWiki is 2, Heading level 1 (page
92
+ # title) is not seen in page flow.
93
class Heading < BaseParagraph
  # @!attribute [r] level
  #   @return [Integer] lesser number means more important heading
  def_readers :level

  # @param children nodes making up the heading content
  # @param level heading level, handed to the parent class as `level:`
  def initialize(children, level)
    super(children, level: level)
  end
end
102
+
103
+ # Represents preformatted text chunk.
104
+ #
105
+ # Paragraph-level thing, can contain many lines of text.
106
class Pre < MergeableParagraph
  # Internal, used by {Parser} for merging: merged chunks of
  # preformatted text are joined by a newline Text node.
  def splitter
    Text.new("\n")
  end

  # Internal, used by {Parser}
  #
  # A non-blank EmptyParagraph is appended to the last child's raw text
  # as a new line (minus one leading space); everything else is handled
  # by the inherited merge.
  def merge!(other)
    return super unless other.is_a?(EmptyParagraph) && !other.text.empty?

    # NOTE(review): presumes the last child exposes #raw_text - confirm
    @children.last.raw_text << "\n" << other.text.sub(/^ /, '')
  end
end
121
+ end
122
+ end
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # Represents footnote.
5
+ #
6
+ # Is not rendered in text flow, so, wikitext like
7
+ #
8
+ # ```
9
+ # ...pushed it back into underdevelopment,<ref>...tons of footnote text...</ref> though it nevertheless...
10
+ # ```
11
+ # when parsed and {Node#text} called, will return text like:
12
+ #
13
+ # ```
14
+ # ...pushed it back into underdevelopment, though it nevertheless...
15
+ # ```
16
+ # ...which most times is most reasonable thing to do.
17
class Ref < Compound
  # @!attribute [r] name
  def_readers :name

  # Always an empty string: we want "clean" text, without references
  # and footnotes messed up in it.
  def text
    ''
  end

  # Internal, used by {Parser}
  #
  # Never empty: even an empty tag should not be dropped.
  def empty?
    false
  end
end
33
+ end
34
+ end
@@ -0,0 +1,89 @@
1
+ # encoding: utf-8
2
+ require 'terminal-table'
3
+
4
+ module Infoboxer
5
+ module Tree
6
+ # Represents table. Tables are complicated!
7
class Table < Compound
  # Internal, used by {Parser}
  #
  # Never empty, whatever its children are.
  def empty?
    false
  end

  # All table rows.
  def rows
    children.select(&fltr(itself: TableRow))
  end

  # Table caption, if exists.
  def caption
    children.detect(&fltr(itself: TableCaption))
  end

  # For now, returns first table row, if it consists only of
  # {TableHeading}s.
  #
  # FIXME: it can easily be several table heading rows
  #
  # @return [TableRow, nil] nil when there are no rows at all or the
  #   first row is not a pure heading row
  def heading_row
    top = rows.first
    top if top && heading_only?(top)
  end

  # For now, returns all table rows except {heading_row}
  #
  # A table without rows yields an empty list instead of raising.
  def body_rows
    all = rows
    top = all.first
    top && heading_only?(top) ? all[1..-1] : all
  end

  # Plain-text rendering through Terminal::Table, followed by two
  # newlines. Trailing newlines of caption and cell texts are removed
  # before they are handed to the table.
  def text
    table = Terminal::Table.new

    title = caption
    table.title = title.text.sub(/\n+\Z/, '') if title

    # Looked up once: every call re-filters the children.
    head = heading_row
    table.headings = cell_texts(head) if head

    table.rows = body_rows.map { |row| cell_texts(row) }
    table.to_s + "\n\n"
  end

  private

  # True when every child of the row matches TableHeading.
  def heading_only?(row)
    row.children.all?(&call(matches?: TableHeading))
  end

  # Texts of all children of the row, without trailing newlines.
  def cell_texts(row)
    row.children.map(&:text).map(&call(sub: [/\n+\Z/, '']))
  end
end
57
+
58
+ # Represents one table row.
59
class TableRow < Compound
  # Never empty, whatever its children are.
  def empty?
    false
  end

  # Row cells: another name for the row's children.
  alias_method :cells, :children
end
66
+
67
+ # Represents any table cell, either {TableCell cell} or
68
+ # {TableHeading heading}.
69
+ #
70
+ # Can be used for lookups (same way as {BaseParagraph}).
71
class BaseCell < Compound
  # Never empty, whatever its children are.
  def empty?
    false
  end
end
76
+
77
+ # Represents ordinary table cell (`td` in HTML).
78
class TableCell < BaseCell
  # Adds nothing of its own on top of BaseCell.
end
80
+
81
+ # Represents table heading cell (`th` in HTML).
82
class TableHeading < BaseCell
  # Adds nothing of its own on top of BaseCell.
end
84
+
85
+ # Represents table caption.
86
class TableCaption < Compound
  # Adds nothing of its own on top of Compound.
end
88
+ end
89
+ end
@@ -0,0 +1,82 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # Template variable.
5
+ #
6
+ # It's basically the thing with name and ANY nodes inside, can be
7
+ # seen only as a direct child of {Template}.
8
class Var < Compound
  # Variable name, as given to the constructor.
  attr_reader :name

  # @param name variable name
  # @param children [Nodes] variable contents, empty by default
  def initialize(name, children = Nodes[])
    super(children)
    @name = name
  end

  # Internal, used by {Parser}
  #
  # Never empty, even without any children.
  def empty?
    false
  end

  protected

  # Short description: cleaned class name with the variable name in
  # parentheses.
  def descr
    "#{clean_class}(#{name})"
  end

  # Equal when both the name and the children are equal.
  def _eq(other)
    return false unless other.name == name

    other.children == children
  end
end
31
+
32
+ # Wikipedia template.
33
+ #
34
+ # Templates are complicated! Also, they are useful.
35
+ #
36
+ # You'd need to understand them from [Wikipedia docs](https://en.wikipedia.org/wiki/Wikipedia:Templates)
37
+ # and then use much of Infoboxer's goodness provided with {Templates}
38
+ # separate module.
39
class Template < Compound
  # Template name and the list of its variables ({Var} nodes).
  attr_reader :name, :variables

  # @param name template name
  # @param variables [Array<Var>] template variables; each of them gets
  #   this template assigned as its parent
  def initialize(name, variables = Nodes[])
    super(Nodes[], extract_params(variables))
    @name = name
    @variables = Nodes[*variables].each { |v| v.parent = self }
  end

  # See {Node#to_tree}
  def to_tree(level = 0)
    nested = variables.map { |var| var.to_tree(level + 1) }.join
    "#{' ' * level}<#{descr}>\n#{nested}"
  end

  # Internal, used by {Parser}.
  #
  # Never empty, even without variables.
  def empty?
    false
  end

  protected

  # Equal when both the name and the variables are equal.
  def _eq(other)
    other.name == name && other.variables == variables
  end

  # Class label which includes the template name.
  def clean_class
    "Template[#{name}]"
  end

  # Builds `{name => raw text}` from those variables whose only child
  # is a single Text node; all other variables are skipped.
  def extract_params(vars)
    simple = vars.select do |v|
      v.children.count == 1 && v.children.first.is_a?(Text)
    end
    # Hash[] takes the list of pairs as-is, no flatten + splat needed.
    Hash[simple.map { |v| [v.name, v.children.first.raw_text] }]
  end

  # Inspects the first two variables as "name: value"; the rest is
  # abbreviated with ", ...".
  def inspect_variables(depth)
    # Elements are Var nodes, not [name, value] pairs, so the name has
    # to be read off each node.
    shown = variables.to_a[0..1].map do |var|
      "#{var.name}: #{var.inspect(depth + 1)}"
    end
    shown.join(', ') + (variables.count > 2 ? ', ...' : '')
  end
end
81
+ end
82
+ end