infoboxer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,185 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # List of nodes, which tries to be useful both as array, and as proxy
5
+ # to its contents.
6
+ #
7
+ # Many of Infoboxer's methods (especially {Navigation}'s) return
8
+ # `Nodes`, and in most cases you don't have to think about it. Same
9
+ # approach can be seen in jQuery or Nokogiri. You just do things
10
+ # like those:
11
+ #
12
+ # ```ruby
13
+ # document.sections. # => Nodes returned,
14
+ # select{|section| # you can treat them as array, but also...
15
+ # section.text.length > 1000 #
16
+ # }. #
17
+ # lookup(:Wikilink, text: /Chile/). # ...use Infoboxer's methods
18
+ # follow. # ...even to receive lists of other pages
19
+ # infoboxes. # ...and use methods on them
20
+ # fetch('leader_name1'). # ...including those which only some node types support
21
+ # map(&:text) # ...and still have full-functioning Array
22
+ # ```
23
+ #
24
class Nodes < Array
  # @private
  # Upper bound on how many elements {#inspect} prints before it
  # summarizes the rest as "...N more nodes".
  MAX_CHILDREN = 5

  # @!method select(&block)
  #   Just like Array#select, but returns Nodes

  # @!method reject(&block)
  #   Just like Array#reject, but returns Nodes

  # @!method sort_by(&block)
  #   Just like Array#sort_by, but returns Nodes

  # @!method flatten
  #   Just like Array#flatten, but returns Nodes

  # @!method compact
  #   Just like Array#compact, but returns Nodes

  # @!method -(other)
  #   Just like Array#-, but returns Nodes

  # Each of these delegates to the Array implementation and re-wraps
  # whatever it returned into a fresh Nodes.
  [:select, :reject, :sort_by, :flatten, :compact, :-].each do |sym|
    define_method(sym) { |*args, &block|
      Nodes[*super(*args, &block)]
    }
  end

  # Just like Array#first, but returns Nodes, if provided with `n` of elements.
  #
  # @param n [Integer, nil] how many leading elements to take; when
  #   omitted, the single first element (or nil) is returned as is.
  def first(n = nil)
    n.nil? ? super() : Nodes[*super(n)]
  end

  # Just like Array#last, but returns Nodes, if provided with `n` of elements.
  #
  # @param n [Integer, nil] how many trailing elements to take; when
  #   omitted, the single last element (or nil) is returned as is.
  def last(n = nil)
    n.nil? ? super() : Nodes[*super(n)]
  end

  # Just like Array#map, but returns Nodes, **if** all map results are
  # Node (or Nodes). Otherwise the plain Array of results is returned.
  #
  # Without a block it behaves exactly like Array#map and returns an
  # Enumerator, so `nodes.map.with_index { ... }` keeps working.
  def map
    return super unless block_given?

    res = super
    if res.all? { |n| n.is_a?(Node) || n.is_a?(Nodes) }
      Nodes[*res]
    else
      res
    end
  end

  # @!method prev_siblings
  #   Previous siblings (flat list) of all nodes inside.

  # @!method next_siblings
  #   Next siblings (flat list) of all nodes inside.

  # @!method siblings
  #   Siblings (flat list) of all nodes inside.

  # @!method fetch
  #   Fetches by name(s) variables for all templates inside.
  #
  #   See {Templates::Base#fetch} for explanation.

  # Each of these sends the same message to every element and flattens
  # the per-element results into one Nodes list.
  [
    :prev_siblings, :next_siblings, :siblings,
    :fetch
  ].each do |sym|
    define_method(sym) { |*args|
      make_nodes(map { |n| n.send(sym, *args) })
    }
  end

  # By list of variable names, fetches hashes of `{name => value}`
  # from all templates inside.
  #
  # See {Templates::Base#fetch_hash} for explanation.
  #
  # @return [Array<Hash>]
  def fetch_hashes(*args)
    map { |t| t.fetch_hash(*args) }
  end

  # Just join of all {Node#to_tree Node#to_tree} strings inside.
  def to_tree
    map(&:to_tree).join("\n")
  end

  # Bracketed, comma-separated `inspect` of the elements; only the
  # first MAX_CHILDREN are shown when there are more than that.
  def inspect
    shown =
      if count > MAX_CHILDREN
        self[0...MAX_CHILDREN].map(&:inspect).join(', ') +
          ", ...#{count - MAX_CHILDREN} more nodes"
      else
        map(&:inspect).join(', ')
      end
    '[' + shown + ']'
  end

  # Just join of all {Node#text Node#text}s inside.
  def text
    map(&:text).join
  end

  # Fetches pages by ALL wikilinks inside in ONE query to MediaWiki
  # API.
  #
  # **NB**: for now, if there's more than 50 wikilinks (limitation for
  # one request to API), Infoboxer **will not** try to do next page.
  # It will be fixed in next releases.
  #
  # Fails with a RuntimeError when the first node has no
  # MediaWiki::Page among its parents, or that page has no client.
  #
  # @return [Nodes<MediaWiki::Page>] It is still `Nodes`, so you
  #   still can process them uniformly.
  def follow
    links = select { |n| n.respond_to?(:link) }.map(&:link)
    return Nodes[] if links.empty?
    page = first.lookup_parents(MediaWiki::Page).first or
      fail("Not in a page from real source")
    page.client or fail("MediaWiki client not set")
    page.client.get(*links)
  end

  # Internal, used by {Parser}
  #
  # Appends with merging: arrays are appended element by element, a
  # node the current last element can merge with is merged into it,
  # nil/empty nodes are dropped and strings are wrapped into Text.
  #
  # NB: unlike Array#<<, the return value is not `self` in every
  # branch, so calls should not be chained.
  def <<(node)
    if node.kind_of?(Array)
      node.each { |n| self << n }
    elsif last && last.can_merge?(node)
      last.merge!(node)
    else
      return if !node || node.empty?
      node = Text.new(node) if node.is_a?(String)
      super
    end
  end

  # Internal, used by {Parser}
  #
  # Returns a copy without trailing whitespace-only Text nodes, and
  # with trailing whitespace removed from the last remaining Text.
  #
  # The patterns are anchored to the whole string (`\A`/`\z`): with
  # line anchors a Text merely containing a blank line would be
  # dropped, and whitespace would be trimmed at the first line end
  # rather than at the end of the text.
  #
  # NB: the copy is shallow, so the trimmed Text node is the same
  # object that is held by the receiver.
  def strip
    res = dup
    res.pop while res.last.is_a?(Text) && res.last.raw_text =~ /\A\s*\z/
    res.last.raw_text.sub!(/\s+\z/, '') if res.last.is_a?(Text)
    res
  end

  # Internal, used by {Parser}
  #
  # Replaces every Paragraph with the result of its `to_templates?`
  # and flattens the outcome into one Nodes list.
  def flow_templates
    make_nodes(map { |n| n.is_a?(Paragraph) ? n.to_templates? : n })
  end

  private

  # Flattens (possibly nested) results into a single Nodes list.
  def make_nodes(arr)
    Nodes[*arr.flatten]
  end
end
184
+ end
185
+ end
@@ -0,0 +1,122 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # Base class for all "paragraph-level" nodes: {Paragraph}, {ListItem},
5
+ # {Heading}. It should be convenient to use it in {Navigation::Lookup::Node#_lookup Node#lookup}
6
+ # and similar methods like this:
7
+ #
8
+ # ```ruby
9
+ # page.lookup(:BaseParagraph) # => flat list of paragraph-levels
10
+ # ```
11
class BaseParagraph < Compound
  # Text of the node as produced by the superclass, stripped of
  # surrounding whitespace and terminated with one blank line.
  def text
    body = super.strip
    body + "\n\n"
  end
end
16
+
17
+ # @private
18
+ # Internal! Nothing to see here! Just YARD `@private` tag not working at class level
19
class EmptyParagraph < Node
  # Raw text this placeholder was created with.
  attr_reader :text

  # @param text [String] stored as is and exposed through {#text}
  def initialize(text)
    @text = text
  end

  # Always reports itself as empty: it should never be left in nodes flow.
  def empty?
    true
  end
end
31
+
32
+ # @private
33
+ # Internal! Nothing to see here! Just YARD `@private` tag not working at class level
34
module Mergeable
  # Whether `other` may be merged into this node: the node must not be
  # closed yet and `other` must be of exactly the same class.
  def can_merge?(other)
    return false if closed?

    self.class == other.class
  end

  # Merges `other` into this node.
  #
  # An EmptyParagraph just closes the node. Anything else contributes
  # the host's `splitter` followed by all of its children, and the node
  # takes over the closed state of `other`.
  def merge!(other)
    return @closed = true if other.is_a?(EmptyParagraph)

    pieces = [splitter, *other.children]
    pieces.each { |piece| @children << piece }
    @closed = other.closed?
  end
end
50
+
51
+ # @private
52
+ # Internal! Nothing to see here! Just YARD `@private` tag not working at class level
53
class MergeableParagraph < BaseParagraph
  include Mergeable

  # Like {Mergeable#can_merge?}, but an EmptyParagraph is accepted as
  # well as a node of exactly the same class.
  def can_merge?(other)
    return false if closed?

    self.class == other.class || other.is_a?(EmptyParagraph)
  end
end
61
+
62
+ # Represents plain text paragraph.
63
class Paragraph < MergeableParagraph
  # Internal, used by {Parser} for merging: merged paragraphs are
  # joined with a single space.
  def splitter
    Text.new(' ')
  end

  # Internal, used by {Parser}
  #
  # True when every child is either a Template or a Text consisting of
  # whitespace only.
  def templates_only?
    children.all? do |child|
      next true if child.is_a?(Template)

      child.is_a?(Text) && child.raw_text.strip.empty?
    end
  end

  # Internal, used by {Parser}
  #
  # Children selected by the `filter(itself: Template)` predicate.
  def to_templates
    children.select(&filter(itself: Template))
  end

  # Internal, used by {Parser}
  #
  # The templates of this paragraph when it holds nothing else
  # (see {#templates_only?}), the paragraph itself otherwise.
  def to_templates?
    return self unless templates_only?

    to_templates
  end
end
84
+
85
+ # Represents a horizontal rule (splitter). Rarely seen in modern wikis. Adds no behavior of its own to {Node}.
86
+ class HR < Node
87
+ end
88
+
89
+ # Represents heading.
90
+ #
91
+ # NB: min heading level in MediaWiki is 2, Heading level 1 (page
92
+ # title) is not seen in page flow.
93
class Heading < BaseParagraph
  # @!attribute [r] level
  #   @return [Integer] the lower the number, the more important the heading
  def_readers :level

  # @param children nodes making up the heading; passed to the superclass as is
  # @param level heading level, passed to the superclass under the `level` key
  def initialize(children, level)
    super(children, level: level)
  end
end
102
+
103
+ # Represents preformatted text chunk.
104
+ #
105
+ # Paragraph-level thing, can contain many lines of text.
106
class Pre < MergeableParagraph
  # Internal, used by {Parser} for merging: merged chunks are joined
  # with a line break.
  def splitter
    Text.new("\n")
  end

  # Internal, used by {Parser}
  #
  # A non-empty EmptyParagraph is appended to the last child's raw
  # text as a new line, with one leading space removed; everything
  # else is handled by the inherited merge.
  def merge!(other)
    return super unless other.is_a?(EmptyParagraph) && !other.text.empty?

    line = other.text.sub(/^ /, '')
    @children.last.raw_text << "\n" << line
  end
end
121
+ end
122
+ end
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # Represents footnote.
5
+ #
6
+ # Is not rendered in text flow, so, wikitext like
7
+ #
8
+ # ```
9
+ # ...pushed it back into underdevelopment,<ref>...tons of footnote text...</ref> though it nevertheless...
10
+ # ```
11
+ # when parsed and {Node#text} called, will return text like:
12
+ #
13
+ # ```
14
+ # ...pushed it back into underdevelopment, though it nevertheless...
15
+ # ```
16
+ # ...which most times is most reasonable thing to do.
17
class Ref < Compound
  # @!attribute [r] name
  def_readers :name

  # Always an empty string: "clean" text should come without
  # references & footnotes messed up in it.
  def text
    ''
  end

  # Internal, used by {Parser}
  #
  # Never empty, so that even an empty tag is not dropped.
  def empty?
    false
  end
end
33
+ end
34
+ end
@@ -0,0 +1,89 @@
1
+ # encoding: utf-8
2
+ require 'terminal-table'
3
+
4
+ module Infoboxer
5
+ module Tree
6
+ # Represents table. Tables are complicated!
7
class Table < Compound
  # Internal, used by {Parser}
  #
  # A table is never considered empty.
  def empty?
    false
  end

  # All table rows: the children selected by `fltr(itself: TableRow)`.
  def rows
    children.select(&fltr(itself: TableRow))
  end

  # Table caption, if exists (nil otherwise).
  def caption
    children.detect(&fltr(itself: TableCaption))
  end

  # For now, returns first table row, if it consists only of
  # {TableHeading}s, and nil otherwise (including a table that has
  # no rows at all).
  #
  # FIXME: it can easily be several table heading rows
  def heading_row
    first_row = rows.first
    # A table without rows has no heading row; without this guard the
    # check below would be sent to nil.
    return nil unless first_row

    first_row.children.all?(&call(matches?: TableHeading)) ? first_row : nil
  end

  # For now, returns all table rows except {heading_row}
  def body_rows
    all_rows = rows
    heading_row ? all_rows[1..-1] : all_rows
  end

  # Plain-text rendering of the table through Terminal::Table, followed
  # by one blank line. Trailing newlines of the caption and of every
  # cell text are removed before rendering.
  def text
    table = Terminal::Table.new
    table.title = caption.text.sub(/\n+\Z/, '') if caption

    header = heading_row
    if header
      table.headings = header.children.map(&:text).
        map(&call(sub: [/\n+\Z/, '']))
    end

    table.rows = body_rows.map { |r|
      r.children.map(&:text).
        map(&call(sub: [/\n+\Z/, '']))
    }
    table.to_s + "\n\n"
  end
end
57
+
58
+ # Represents one table row.
59
class TableRow < Compound
  # A row is never considered empty.
  def empty?
    false
  end

  # `cells` is another name for the row's `children`.
  alias_method :cells, :children
end
66
+
67
+ # Represents any table cell, either {TableCell cell} or
68
+ # {TableHeading heading}.
69
+ #
70
+ # Can be used for lookups (same way as {BaseParagraph}).
71
class BaseCell < Compound
  # A cell is never considered empty, whatever it contains.
  def empty?
    false
  end
end
76
+
77
+ # Represents an ordinary table cell (`td` in HTML). Adds no behavior of its own to {BaseCell}.
78
+ class TableCell < BaseCell
79
+ end
80
+
81
+ # Represents a table heading cell (`th` in HTML). Adds no behavior of its own to {BaseCell}.
82
+ class TableHeading < BaseCell
83
+ end
84
+
85
+ # Represents a table caption. Adds no behavior of its own to {Compound}.
86
+ class TableCaption < Compound
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,82 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
+ # Template variable.
5
+ #
6
+ # It's basically the thing with name and ANY nodes inside, can be
7
+ # seen only as a direct child of {Template}.
8
class Var < Compound
  # Name of the variable, as given to the constructor.
  attr_reader :name

  # @param name name of the variable
  # @param children nodes making up the value; empty Nodes by default
  def initialize(name, children = Nodes[])
    super(children)
    @name = name
  end

  # Internal, used by {Parser}
  #
  # A variable is never considered empty.
  def empty?
    false
  end

  protected

  # Short description: `clean_class` followed by the name in parentheses.
  def descr
    "#{clean_class}(#{name})"
  end

  # Two variables are equal when both name and children are equal.
  def _eq(other)
    same_name = other.name == name
    same_name && other.children == children
  end
end
31
+
32
+ # Wikipedia template.
33
+ #
34
+ # Templates are complicated! Also, they are useful.
35
+ #
36
+ # You'd need to understand them from [Wikipedia docs](https://en.wikipedia.org/wiki/Wikipedia:Templates)
37
+ # and then use much of Infoboxer's goodness provided with {Templates}
38
+ # separate module.
39
class Template < Compound
  # Template name and its variables (Nodes of {Var}), as given to the
  # constructor.
  attr_reader :name, :variables

  # @param name name of the template
  # @param variables list of {Var}; each one gets this template
  #   assigned as its parent
  #
  # The template itself has no children; the variables whose value is
  # a single Text are additionally passed to the superclass as a
  # `{name => raw text}` hash (see {#extract_params}).
  def initialize(name, variables = Nodes[])
    super(Nodes[], extract_params(variables))
    @name = name
    @variables = Nodes[*variables].each { |v| v.parent = self }
  end

  # See {Node#to_tree}
  def to_tree(level = 0)
    ' ' * level + "<#{descr}>\n" +
      variables.map { |var| var.to_tree(level + 1) }.join
  end

  # Internal, used by {Parser}.
  #
  # A template is never considered empty.
  def empty?
    false
  end

  protected

  # Two templates are equal when both name and variables are equal.
  def _eq(other)
    other.name == name && other.variables == variables
  end

  def clean_class
    "Template[#{name}]"
  end

  # Builds `{name => raw text}` from the variables whose only child is
  # a Text; all other variables are skipped.
  def extract_params(vars)
    # NB: backports' to_h is cleaner but has performance penalty :(
    # Hash[] accepts the list of pairs directly, so there is no need to
    # flatten it and splat every key and value as a separate argument.
    pairs = vars.
      select { |v| v.children.count == 1 && v.children.first.is_a?(Text) }.
      map { |v| [v.name, v.children.first.raw_text] }
    Hash[pairs]
  end

  # Shortened description of the variables: the first two as
  # "name: inspect", followed by ", ..." when there are more.
  #
  # `variables` is a list of Var nodes, not of pairs, so each element
  # is used as the variable itself; destructuring it into
  # `|name, var|` left `var` nil.
  def inspect_variables(depth)
    variables.to_a[0..1].map { |var| "#{var.name}: #{var.inspect(depth + 1)}" }.join(', ') +
      (variables.count > 2 ? ', ...' : '')
  end
end
81
+ end
82
+ end