infoboxer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Base class for all nodes with children.
#
# Keeps an ordered {Nodes} list and registers itself as the `parent`
# of every child it is given.
class Compound < Node
  # @param children [Nodes, Array<Node>] initial children; each one
  #   gets its `parent` set to this node
  # @param params [Hash] node params, handed to {Node#initialize}
  def initialize(children = Nodes.new, params = {})
    super(params)
    @children = Nodes[*children]
    @children.each { |c| c.parent = self }
  end

  # List of children
  #
  # @return [Nodes]
  attr_reader :children

  # Index of provided node in children list.
  #
  # The child is located by object identity, not by `==`: {Node#==}
  # compares nodes structurally, so with `==` two equal-looking
  # siblings would both report the position of the first one.
  #
  # @param child [Node]
  # @return [Integer, nil] zero-based position, `nil` if not a child
  def index_of(child)
    children.index { |c| c.equal?(child) }
  end

  # Internal, used by {Parser}
  #
  # Appends the nodes to the children list and makes this node their
  # parent.
  def push_children(*nodes)
    nodes.each do |n|
      n.parent = self
      @children << n
    end
  end

  # See {Node#text}
  #
  # Texts of all children, joined with {#children_separator}.
  def text
    children.map(&:text).join(children_separator)
  end

  # See {Node#to_tree}
  #
  # A node whose only child is a {Text} is rendered on a single line;
  # otherwise the node's own line is followed by its children, one
  # level deeper.
  def to_tree(level = 0)
    if children.count == 1 && children.first.is_a?(Text)
      "#{indent(level)}#{children.first.text} <#{descr}>\n"
    else
      "#{indent(level)}<#{descr}>\n" +
        children.map(&call(to_tree: level + 1)).join
    end
  end

  # Kinda "private" methods, used by Parser only -------------------

  # Internal, used by {Parser}
  def can_merge?(other)
    false
  end

  # Internal, used by {Parser}
  def closed!
    @closed = true
  end

  # Internal, used by {Parser}
  #
  # Falsy (`nil`) until {#closed!} has been called.
  def closed?
    @closed
  end

  # Internal, used by {Parser}
  def empty?
    children.empty?
  end

  protected

  # String placed between children texts in {#text}.
  def children_separator
    ''
  end

  private

  # Equality hook used by {Node#==}: compounds are equal when their
  # children lists are equal.
  def _eq(other)
    children == other.children
  end
end
80
+ end
81
+ end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Root node representing the entire document.
#
# Adds nothing to the standard compound node functionality here; it is
# also meant to be a {Navigation::Sections::Container}.
class Document < Compound; end
10
+ end
11
+ end
@@ -0,0 +1,76 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Text rendering shared by the HTML tag nodes that include this module.
#
# Including classes must respond to `tag` and have a `text`
# implementation further up their ancestor chain.
module HTMLTagCommons
  # Tags whose text is followed by a line break.
  # Frozen so the shared list cannot be modified by accident.
  BLOCK_TAGS = %w[div p br].freeze # FIXME: are some other used in WP?

  # Inherited text, with "\n" appended when the tag is one of
  # {BLOCK_TAGS}.
  #
  # @return [String]
  def text
    super + (BLOCK_TAGS.include?(tag) ? "\n" : '')
  end
end
11
+
12
# Represents HTML tag, surrounding some contents.
class HTMLTag < Compound
  include HTMLTagCommons

  # Tag name, as given to the constructor.
  attr_reader :tag

  # @param tag tag name
  # @param attrs [Hash] tag attributes, stored as node params
  # @param children [Nodes] nodes enclosed by the tag
  def initialize(tag, attrs, children = Nodes.new)
    super(children, attrs)
    @tag = tag
  end

  # Tag attributes: another name for the node params.
  alias attrs params

  # Internal, used by {Parser}.
  #
  # Always false: even empty tag, for ex., <br>, should not be dropped!
  def empty?
    false
  end

  private

  # Node description: class name, tag name and attributes.
  def descr
    "#{clean_class}:#{tag}(#{show_params})"
  end
end
36
+
37
# Represents orphan opening HTML tag.
#
# NB: Infoboxer does not try to parse the entire structure of
# HTML-heavy MediaWiki articles. So, if you have `<div>` at line 150
# and closing `</div>` at line 875, there would be an orphan
# `HTMLOpeningTag` and {HTMLClosingTag}. It is not always convenient,
# but reasonable enough.
#
class HTMLOpeningTag < Node
  include HTMLTagCommons

  # Tag name, as given to the constructor.
  attr_reader :tag

  # @param tag tag name
  # @param attrs [Hash] tag attributes, stored as node params
  def initialize(tag, attrs)
    super(attrs)
    @tag = tag
  end

  # Tag attributes: another name for the node params.
  alias attrs params

  private

  # Node description: class name, tag name and attributes.
  def descr
    "#{clean_class}:#{tag}(#{show_params})"
  end
end
61
+
62
# Represents orphan closing HTML tag. See {HTMLOpeningTag} for
# explanation.
class HTMLClosingTag < Node
  # @param tag tag name
  def initialize(tag)
    # Run Node#initialize so that `params` is an empty Hash rather than
    # nil, like for every other node.
    super()
    @tag = tag
  end

  # Tag name, as given to the constructor.
  attr_reader :tag

  # Node description: class name and tag name (a closing tag carries
  # no attributes).
  def descr
    "#{clean_class}:#{tag}"
  end
end
75
+ end
76
+ end
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Represents image (or other media file).
#
# See [Wikipedia Tutorial](https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax)
# for explanation of attributes.
class Image < Node
  # @param path path of the image, stored as the `:path` param
  # @param params [Hash] other image params; `:caption` is taken out
  #   and kept separately (see {#caption}), the rest become node params
  def initialize(path, params = {})
    # Work on a copy: extracting :caption must not change the hash
    # owned by the caller.
    params = params.dup
    @caption = params.delete(:caption)
    super({path: path}.merge(params))
  end

  # Image caption. Can have (sometimes many) other nodes inside.
  #
  # @return [Nodes]
  attr_reader :caption

  # @!attribute [r] path
  # @!attribute [r] type
  # @!attribute [r] location
  # @!attribute [r] alignment
  # @!attribute [r] link
  # @!attribute [r] alt

  def_readers :path, :type,
              :location, :alignment, :link,
              :alt

  # Whether the `:border` param is present and non-blank.
  def border?
    !params[:border].to_s.empty?
  end

  # `:width` param converted with `to_i` (0 when the param is absent).
  def width
    params[:width].to_i
  end

  # `:height` param converted with `to_i` (0 when the param is absent).
  def height
    params[:height].to_i
  end

  # See {Node#to_tree}
  #
  # The node's own line, followed by a "caption:" section with the
  # caption nodes when the caption is present and not empty.
  def to_tree(level = 0)
    own = super(level)
    return own unless caption && !caption.empty?

    own + indent(level + 1) + "caption:\n" +
      caption.map(&call(to_tree: level + 2)).join
  end
end
52
+ end
53
+ end
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Italic text: a plain {Compound} with no behaviour of its own.
class Italic < Compound; end
7
+
8
# Bold text: a plain {Compound} with no behaviour of its own.
class Bold < Compound; end
11
+
12
# Bold italic text (and no, it's not a combination of bold + italic,
# from Wikipedia's markup point of view).
class BoldItalic < Compound; end
16
+
17
# Base class for internal/external links.
class Link < Compound
  # @param link link target, stored as the `:link` param
  # @param label [Nodes, nil] link contents; when omitted, a single
  #   {Text} node holding the link itself is used
  def initialize(link, label = nil)
    contents = label || Nodes.new([Text.new(link)])
    super(contents, link: link)
  end

  # @!attribute [r] link

  def_readers :link
end
27
+
28
# External link. Has other nodes as its contents, and, err, link (url).
class ExternalLink < Link
  # @!attribute [r] url
  #   synonym for `#link`

  alias url link
end
36
+ end
37
+ end
38
+
39
+ require_relative 'wikilink'
@@ -0,0 +1,160 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Represents item of ordered or unordered list.
class ListItem < BaseParagraph
  # Internal, used by {Parser}
  #
  # Another item can be merged in when it is of exactly the same class
  # and its first child is a {List}.
  def can_merge?(other)
    return false unless other.class == self.class

    other.children.first.kind_of?(List)
  end

  # Internal, used by {Parser}
  #
  # Takes over the children of `other`: its first child is merged into
  # this item's last child when that child accepts it; all remaining
  # children are appended.
  def merge!(other)
    incoming = other.children.dup
    tail = children.last
    tail.merge!(incoming.shift) if tail && tail.can_merge?(incoming.first)
    push_children(*incoming)
  end

  # Item text: marker, then children texts. A line feed ends the item's
  # own line; when the last child is a {List}, the line feed goes
  # before that nested list's text.
  def text
    marker = make_marker
    nested = children.last
    body =
      if nested.is_a?(List)
        children[0..-2].map(&:text).join + "\n" + nested.text
      else
        children.map(&:text).join + "\n"
      end
    marker + body
  end

  private

  # Marker supplied by the parent, or '* ' for an item without parent.
  def make_marker
    return '* ' unless parent

    parent.make_marker(self)
  end
end
35
+
36
# "Imaginary" node, grouping {ListItem}s of same level and type.
#
# Base for concrete {OrderedList}, {UnorderedList} and {DefinitionList}.
#
# NB: Nested lists are represented by structures like:
#
# ```
# <OrderedList>
#   <ListItem>
#   <ListItem>
#     <Text>
#     <UnorderedList>
#       <ListItem>
#       <ListItem>
# ...and so on
# ```
class List < Compound
  # Nesting depth: the number of {List} parents found by
  # `lookup_parents`.
  def list_level
    lookup_parents(List).count
  end

  # Indentation put before item markers: one ' ' per nesting level.
  def list_text_indent
    ' ' * list_level
  end

  # Text of all items with trailing line feeds normalized: two for a
  # top-level list, one for a nested list.
  def text
    trailer = list_level.zero? ? "\n\n" : "\n"
    super.sub(/\n+\Z/, trailer)
  end
end
69
+
70
# Represents unordered list (list with markers).
class UnorderedList < List
  # Marker for an item: list indent followed by '* ' (the item itself
  # is not consulted).
  def make_marker(item)
    "#{list_text_indent}* "
  end
end
76
+
77
# Represents ordered list (list with numbers).
class OrderedList < List
  # Marker for an item: list indent followed by the item's one-based
  # position and '. '.
  def make_marker(item)
    "#{list_text_indent}#{item.index + 1}. "
  end
end
83
+
84
# Represents definitions list (`term: definition` structure),
# consists of {DTerm}s and {DDefinition}s.
#
# NB: In fact, at least in English Wikipedia, orphan "definition terms"
# are used as low-level headers, especially in lists of
# links/references.
class DefinitionList < List
  # Marker for an item: the list indent for a {DTerm}, the list indent
  # plus ' ' for a {DDefinition}, and `nil` for anything else.
  def make_marker(item)
    if item.is_a?(DTerm)
      list_text_indent
    elsif item.is_a?(DDefinition)
      list_text_indent + ' '
    end
  end
end
99
+
100
# Term in {DefinitionList}
class DTerm < ListItem
  # Item text with ':' inserted before its first line feed.
  def text
    item_text = super
    item_text.sub("\n", ":\n")
  end
end
106
+
107
# Term definition in {DefinitionList}; behaves as a plain {ListItem}.
class DDefinition < ListItem; end
110
+
111
# Parser-facing part of {List}: merging adjacent lists and building
# (possibly nested) lists from a wiki marker.
class List < Compound
  include Mergeable

  # Marker character => list class. Frozen: a shared lookup table.
  #
  # (A bare `private` does not apply to constants, so none is used.)
  # @private
  LISTS = {
    ';' => DefinitionList,
    ':' => DefinitionList,
    '*' => UnorderedList,
    '#' => OrderedList
  }.freeze

  # Marker character => list item class. Frozen: a shared lookup table.
  # @private
  ITEMS = {
    ';' => DTerm,
    ':' => DDefinition,
    '*' => ListItem,
    '#' => ListItem
  }.freeze

  # Internal, used by {Parser}
  #
  # Takes over the children of `other`: its first child is merged into
  # this list's last child when that child accepts it; all remaining
  # children are appended.
  def merge!(other)
    ochildren = other.children.dup
    last_child = children.last
    first_other = ochildren.first

    if last_child && first_other && last_child.can_merge?(first_other)
      last_child.merge!(ochildren.shift)
    end

    push_children(*ochildren)
  end

  # Internal, used by {Parser}
  #
  # Builds a list for the first marker character, holding one item.
  # The item contains `nodes` when no marker characters remain, or the
  # list built from the remaining characters otherwise.
  #
  # @param marker [Array<String>] marker characters, outermost first;
  #   left unmodified
  # @param nodes contents of the innermost item
  # @raise [RuntimeError] when the first marker character is unknown
  def self.construct(marker, nodes)
    # Consume a copy, so the caller's array is not emptied.
    remaining = marker.dup
    m = remaining.shift

    klass = LISTS[m]
    fail("Something went wrong: undefined list marker type #{m}") unless klass
    item_klass = ITEMS[m]

    contents = remaining.empty? ? nodes : construct(remaining, nodes)
    klass.new(item_klass.new(contents))
  end
end
159
+ end
160
+ end
@@ -0,0 +1,181 @@
1
+ # encoding: utf-8
2
+ require 'htmlentities'
3
+
4
+ module Infoboxer
5
+ module Tree
6
# This is the base class for all parse tree nodes.
#
# Basically, you'll
# never create instances of this class or its descendants by yourself,
# you will receive it from tree and use for navigations.
#
class Node
  include ProcMe

  # Maximum number of text characters shown by {#inspect}.
  MAX_CHARS = 30

  # @param params [Hash] node params, see {#params}
  def initialize(params = {})
    @params = params
  end

  # Hash of node "params".
  #
  # The notion of params is roughly the same as tag attributes in HTML.
  # This is relevant for complex nodes like images, tables, raw HTML
  # tags and so on.
  #
  # The most relevant params are typically exposed by node as instance
  # methods (like {Heading#level}).
  #
  # @return [Hash]
  attr_reader :params

  # Node's parent in tree
  # @return [Node]
  attr_accessor :parent

  # Nodes are equal when they are of exactly the same class and the
  # class-specific `_eq` check passes.
  def ==(other)
    self.class == other.class && _eq(other)
  end

  # Position in parent's children array (zero-based); 0 for a node
  # without parent.
  def index
    parent ? parent.index_of(self) : 0
  end

  # List of all sibling nodes (children of same parent)
  def siblings
    parent ? parent.children - [self] : Nodes[]
  end

  # List of siblings before this one
  def prev_siblings
    own_index = index # computed once, not once per sibling
    siblings.select { |n| n.index < own_index }
  end

  # List of siblings after this one
  def next_siblings
    own_index = index # computed once, not once per sibling
    siblings.select { |n| n.index > own_index }
  end

  # Node children list
  def children
    Nodes[] # redefined in descendants
  end

  # Used only during tree construction in {Parser}.
  def can_merge?(other)
    false
  end

  # Whether node is empty (definition of "empty" varies for different
  # kinds of nodes). Used mainly in {Parser}.
  def empty?
    false
  end

  # Textual representation of this node and its children, ready for
  # pretty-printing. Use it like this:
  #
  # ```ruby
  # puts page.lookup(:Paragraph).first.to_tree
  # # Prints something like
  # # <Paragraph>
  # # This <Italic>
  # # is <Text>
  # # <Wikilink(link: "Argentina")>
  # # pretty <Italic>
  # # complicated <Text>
  # ```
  #
  # Useful for understanding page structure, and Infoboxer's
  # representation of this structure
  def to_tree(level = 0)
    indent(level) + "<#{descr}>\n"
  end

  # Short description: node class and params, plus shortened text when
  # the node has any.
  def inspect
    text.empty? ? "#<#{descr}>" : "#<#{descr}: #{shorten_text}>"
  end

  # Node text representation. It is defined for all nodes so, that
  # entire `Document#text` produce readable text-only representation
  # of Wiki page. Therefore, rules are those:
  # * inline-formatting nodes (text, bold, italics) just return the
  #   text;
  # * paragraph-level nodes (headings, paragraphs, lists) add `"\n\n"`
  #   after text;
  # * list items add marker before text;
  # * nodes, not belonging to "main" text flow (references, templates)
  #   produce empty text.
  #
  # If you want just the text of some heading or list item (without
  # "formatting" quirks), you can use {Node#text_} method.
  #
  def text
    '' # redefined in descendants
  end

  # "Clean" version of node text: {#text} with leading and trailing
  # whitespace (e.g. trailing linefeeds added for formatting) stripped.
  #
  def text_
    text.strip
  end

  # See {Node#text_}
  def to_s
    # just aliases will not work when #text will be redefined in subclasses
    text_
  end

  private

  # Clean text cut down to at most MAX_CHARS characters, with '...'
  # appended when something was cut off.
  def shorten_text
    clean = text_
    # Exclusive range: exactly MAX_CHARS characters are kept.
    clean.length > MAX_CHARS ? clean[0...MAX_CHARS] + '...' : clean
  end

  # Class name without its namespace.
  def clean_class
    self.class.name.sub(/^.*::/, '')
  end

  # Class name, followed by params in parentheses when there are any.
  def descr
    if !params || params.empty?
      clean_class
    else
      "#{clean_class}(#{show_params})"
    end
  end

  # Params (or the given hash) rendered as "key: value, ...".
  def show_params(prms = nil)
    (prms || params).map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
  end

  # Indentation string for the given tree level.
  def indent(level)
    ' ' * level
  end

  # Class-specific part of {#==}; must be implemented by subclasses.
  def _eq(other)
    fail(NotImplementedError, "#_eq should be defined in subclasses")
  end

  # Decodes HTML entities in the string with the shared {Node.coder}.
  def decode(str)
    Node.coder.decode(str)
  end

  class << self
    # Internal: descendants DSL. Defines, for every key, a reader
    # method returning `params[key]`.
    def def_readers(*keys)
      keys.each do |k|
        define_method(k) { params[k] }
      end
    end

    # Internal: HTML entities decoder, created on first use.
    def coder
      @coder ||= HTMLEntities.new
    end
  end
end
180
+ end
181
+ end