infoboxer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Base class for all nodes with children.
class Compound < Node
  # @param children [Nodes, Array<Node>] initial children; every one of
  #   them gets its `parent` set to the new node.
  # @param params [Hash] node params, handed over to {Node#initialize}.
  def initialize(children = Nodes.new, params = {})
    super(params)
    @children = Nodes[*children]
    @children.each { |child| child.parent = self }
  end

  # List of children
  #
  # @return [Nodes]
  attr_reader :children

  # Index of provided node in children list.
  #
  # The lookup is done by object identity, not by `==`: {Node#==} compares
  # nodes structurally, so an equality-based search would return the
  # position of the first sibling with the same contents instead of the
  # position of the node that was actually asked about.
  #
  # @param child [Node]
  # @return [Integer, nil] zero-based position, or `nil` if not a child
  def index_of(child)
    children.index { |candidate| candidate.equal?(child) }
  end

  # Internal, used by {Parser}
  #
  # Re-parents all provided nodes to this node, then appends them to the
  # children list.
  #
  # @return [Array<Node>] the nodes that were passed in
  def push_children(*nodes)
    nodes.each { |node| node.parent = self }
    nodes.each { |node| @children << node }
  end

  # See {Node#text}
  #
  # Texts of all children, joined with {#children_separator}.
  def text
    children.map(&:text).join(children_separator)
  end

  # See {Node#to_tree}
  #
  # A node whose only child is a {Text} is rendered on a single line
  # ("text <Descr>"); otherwise the node's own line is followed by the
  # trees of its children, one level deeper.
  def to_tree(level = 0)
    if children.count == 1 && children.first.is_a?(Text)
      "#{indent(level)}#{children.first.text} <#{descr}>\n"
    else
      # NOTE(review): `call` is not defined in this file; presumably it is
      # the ProcMe helper mixed into Node -- confirm.
      "#{indent(level)}<#{descr}>\n" +
        children.map(&call(to_tree: level + 1)).join
    end
  end

  # Kinda "private" methods, used by Parser only -------------------

  # Internal, used by {Parser}
  def can_merge?(other)
    false
  end

  # Internal, used by {Parser}
  def closed!
    @closed = true
  end

  # Internal, used by {Parser}
  #
  # @return [true, nil] `nil` until {#closed!} was called
  def closed?
    @closed
  end

  # Internal, used by {Parser}
  def empty?
    children.empty?
  end

  protected

  # String inserted between children texts in {#text}.
  def children_separator
    ''
  end

  private

  # Two compound nodes of the same class are equal when their children are.
  def _eq(other)
    children == other.children
  end
end
80
+ end
81
+ end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Root node of the parse tree: represents the entire document.
#
# Adds nothing of its own to the standard {Compound} behaviour here.
# It is also meant to act as a {Navigation::Sections::Container}
# (NOTE(review): that mixin is not included in this file -- presumably
# it is wired in by the navigation code; confirm).
class Document < Compound; end
10
+ end
11
+ end
@@ -0,0 +1,76 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Behaviour shared by HTML tag nodes.
#
# Expects the including class to provide `#tag` and a `#text`
# implementation reachable through `super`.
module HTMLTagCommons
  # Tags that end a line in the plain-text representation.
  # Frozen, so the shared list cannot be modified by accident.
  BLOCK_TAGS = %w[div p br].freeze # FIXME: are some other used in WP?

  # Text of the node, followed by a line feed for block-level tags.
  #
  # @return [String]
  def text
    line_end = BLOCK_TAGS.include?(tag) ? "\n" : ''
    super + line_end
  end
end
11
+
12
# HTML tag with both opening and closing parts found, wrapping some
# contents (its children).
class HTMLTag < Compound
  include HTMLTagCommons

  # Tag name, as passed to the constructor.
  attr_reader :tag

  # Tag attributes: just another name for node params.
  alias_method :attrs, :params

  # @param tag tag name
  # @param attrs tag attributes, stored as node params
  # @param children [Nodes] tag contents
  def initialize(tag, attrs, children = Nodes.new)
    super(children, attrs)
    @tag = tag
  end

  # Internal, used by {Parser}.
  #
  # Always false: even a tag without contents (for ex., <br>) carries
  # meaning and should not be dropped.
  def empty?
    false
  end

  private

  # Description for #inspect / #to_tree: class name, tag and attributes.
  def descr
    format('%s:%s(%s)', clean_class, tag, show_params)
  end
end
36
+
37
# Represents orphan opening HTML tag.
#
# NB: Infoboxer does not try to parse the entire structure of HTML-heavy
# MediaWiki articles. So, if you have `<div>` at line 150 and the closing
# `</div>` at line 875, there will be an orphan `HTMLOpeningTag` and an
# orphan {HTMLClosingTag}. It is not always convenient, but reasonable
# enough.
#
class HTMLOpeningTag < Node
  include HTMLTagCommons

  # Tag name, as passed to the constructor.
  attr_reader :tag

  # Tag attributes: just another name for node params.
  alias_method :attrs, :params

  # @param tag tag name
  # @param attrs tag attributes, stored as node params
  def initialize(tag, attrs)
    super(attrs)
    @tag = tag
  end

  private

  # Description for #inspect / #to_tree: class name, tag and attributes.
  def descr
    format('%s:%s(%s)', clean_class, tag, show_params)
  end
end
61
+
62
# Represents orphan closing HTML tag. See {HTMLOpeningTag} for
# explanation.
class HTMLClosingTag < Node
  # @param tag tag name
  def initialize(tag)
    # Run Node's initializer so that #params is an (empty) Hash rather
    # than nil; inherited helpers that iterate over params rely on it.
    super()
    @tag = tag
  end

  # Tag name, as passed to the constructor.
  attr_reader :tag

  # Description for #inspect / #to_tree: class name and tag
  # (a closing tag has no attributes to show).
  def descr
    "#{clean_class}:#{tag}"
  end
end
75
+ end
76
+ end
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Represents image (or other media file).
#
# See [Wikipedia Tutorial](https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax)
# for explanation of attributes.
class Image < Node
  # @param path image path, stored as the `:path` param
  # @param params [Hash] other image attributes; `:caption` is taken out
  #   of them and kept separately (see {#caption}). The hash passed by
  #   the caller is left untouched.
  def initialize(path, params = {})
    # Work on a copy: deleting :caption from the argument itself would
    # silently modify the caller's hash.
    attrs = params.dup
    @caption = attrs.delete(:caption)
    super({path: path}.merge(attrs))
  end

  # Image caption. Can have (sometimes many) other nodes inside.
  #
  # @return [Nodes]
  attr_reader :caption

  # @!attribute [r] path
  # @!attribute [r] type
  # @!attribute [r] location
  # @!attribute [r] alignment
  # @!attribute [r] link
  # @!attribute [r] alt

  def_readers :path, :type,
              :location, :alignment, :link,
              :alt

  # Whether the `:border` param is present and non-empty.
  def border?
    !params[:border].to_s.empty?
  end

  # `:width` param converted with `to_i` (so 0 when it is absent).
  #
  # @return [Integer]
  def width
    params[:width].to_i
  end

  # `:height` param converted with `to_i` (so 0 when it is absent).
  #
  # @return [Integer]
  def height
    params[:height].to_i
  end

  # See {Node#to_tree}
  #
  # The node's own line, followed by a "caption:" section with the
  # caption nodes when the caption is present and not empty.
  def to_tree(level = 0)
    own_line = super(level)
    return own_line + '' unless caption && !caption.empty?

    own_line +
      indent(level + 1) + "caption:\n" +
      caption.map(&call(to_tree: level + 2)).join
  end
end
52
+ end
53
+ end
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Italic text: a plain {Compound} with no behaviour of its own.
class Italic < Compound; end
7
+
8
# Bold text: a plain {Compound} with no behaviour of its own.
class Bold < Compound; end
11
+
12
# Bold italic text (and no, it's not a combination of bold + italic,
# from Wikipedia's markup point of view).
class BoldItalic < Compound; end
16
+
17
# Base class for internal/external links.
class Link < Compound
  # @param link link target, stored as the `:link` param
  # @param label [Nodes, nil] link contents; when omitted, a single
  #   {Text} node holding the link itself is used instead
  def initialize(link, label = nil)
    contents = label || Nodes.new([Text.new(link)])
    super(contents, link: link)
  end

  # @!attribute [r] link

  def_readers :link
end
27
+
28
# External link. Contains other nodes as its contents and, well,
# the link itself (url).
class ExternalLink < Link
  # @!attribute [r] url
  #   synonym for `#link`

  alias url link
end
36
+ end
37
+ end
38
+
39
+ require_relative 'wikilink'
@@ -0,0 +1,160 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Tree
4
# Represents item of ordered or unordered list.
class ListItem < BaseParagraph
  # Internal, used by {Parser}
  #
  # Another item can be merged into this one when it is of exactly the
  # same class and its first child is a (nested) {List}.
  def can_merge?(other)
    return false unless other.class == self.class

    other.children.first.kind_of?(List)
  end

  # Internal, used by {Parser}
  #
  # Merges the first child of `other` into this item's last child when
  # that child accepts it, then adopts the remaining children of `other`.
  def merge!(other)
    incoming = other.children.dup
    tail = children.last
    tail.merge!(incoming.shift) if tail && tail.can_merge?(incoming.first)
    push_children(*incoming)
  end

  # Item text, prefixed with the list marker.
  #
  # When the last child is a nested {List}, the line feed goes between
  # the item's own text and the nested list; otherwise it goes last.
  def text
    marker = make_marker
    body =
      if children.last.is_a?(List)
        own_text = children[0..-2].map(&:text).join
        own_text + "\n" + children.last.text
      else
        children.map(&:text).join + "\n"
      end
    marker + body
  end

  private

  # Marker is produced by the parent; an item without a parent falls
  # back to a plain bullet.
  def make_marker
    return '* ' unless parent

    parent.make_marker(self)
  end
end
35
+
36
# "Imaginary" node, grouping {ListItem}s of same level and type.
#
# Base for concrete {OrderedList}, {UnorderedList} and {DefinitionList}.
#
# NB: Nested lists are represented by structures like:
#
# ```
# <OrderedList>
#   <ListItem>
#   <ListItem>
#     <Text>
#     <UnorderedList>
#       <ListItem>
#       <ListItem>
# ...and so on
# ```
class List < Compound
  # Nesting depth: the number of {List}s found by `lookup_parents`.
  def list_level
    lookup_parents(List).count
  end

  # Indentation put before item markers, one unit per nesting level.
  def list_text_indent
    ' ' * list_level
  end

  # List text with trailing line feeds normalized: a top-level list
  # ends with an empty line, a nested one with a single line feed.
  def text
    ending = list_level.zero? ? "\n\n" : "\n"
    super.sub(/\n+\Z/, ending)
  end
end
69
+
70
# Represents unordered list (list with markers).
class UnorderedList < List
  # Marker for the given item: indentation plus a bullet. The item
  # itself is not consulted.
  def make_marker(item)
    "#{list_text_indent}* "
  end
end
76
+
77
# Represents ordered list (list with numbers).
class OrderedList < List
  # Marker for the given item: indentation plus the item's one-based
  # number, taken from its (zero-based) `index`.
  def make_marker(item)
    number = item.index + 1
    "#{list_text_indent}#{number}. "
  end
end
83
+
84
# Represents definitions list (`term: definition` structure),
# consists of {DTerm}s and {DDefinition}s.
#
# NB: In fact, at least in English Wikipedia, orphan "definition terms"
# are used as low-level headers, especially in lists of links/references.
class DefinitionList < List
  # Marker for the given item: terms get just the list indentation,
  # definitions are indented a bit further. Returns `nil` for any other
  # kind of item.
  def make_marker(item)
    if item.is_a?(DTerm)
      list_text_indent
    elsif item.is_a?(DDefinition)
      list_text_indent + ' '
    end
  end
end
99
+
100
# Term in {DefinitionList}
class DTerm < ListItem
  # Item text with a colon inserted before its first line feed.
  def text
    item_text = super
    item_text.sub("\n", ":\n")
  end
end
106
+
107
# Term definition in {DefinitionList}; behaves exactly like {ListItem}.
class DDefinition < ListItem; end
110
+
111
# Parser-facing part of {List}: merging of adjacent lists and
# construction of (possibly nested) lists from wiki markers.
class List < Compound
  include Mergeable

  # Internal, used by {Parser}
  #
  # Merges the first child of `other` into this list's last child when
  # that child accepts it, then adopts the remaining children of `other`.
  def merge!(other)
    incoming = other.children.dup
    tail = children.last
    head = incoming.first
    tail.merge!(incoming.shift) if tail && head && tail.can_merge?(head)

    push_children(*incoming)
  end

  # Internal, used by {Parser}
  #
  # Builds a list for the first marker character, with a single item in
  # it; remaining marker characters produce lists nested inside that
  # item, and `nodes` end up in the innermost item.
  #
  # The `marker` array passed by the caller is not modified.
  #
  # @param marker [Array<String>] marker characters, outermost first
  # @param nodes contents of the innermost list item
  # @raise [RuntimeError] if the marker character is not a known one
  def self.construct(marker, nodes)
    type, *nested = marker
    klass = LISTS[type] ||
            raise("Something went wrong: undefined list marker type #{type}")
    item_klass = ITEMS[type]

    contents = nested.empty? ? nodes : construct(nested, nodes)
    klass.new(item_klass.new(contents))
  end

  private

  # NB: `private` above has no effect on constants; they stay accessible
  # as before.

  # @private
  # Marker character => list class.
  LISTS = {
    ';' => DefinitionList,
    ':' => DefinitionList,
    '*' => UnorderedList,
    '#' => OrderedList
  }.freeze

  # @private
  # Marker character => list item class.
  ITEMS = {
    ';' => DTerm,
    ':' => DDefinition,
    '*' => ListItem,
    '#' => ListItem
  }.freeze
end
159
+ end
160
+ end
@@ -0,0 +1,181 @@
1
+ # encoding: utf-8
2
+ require 'htmlentities'
3
+
4
+ module Infoboxer
5
+ module Tree
6
# This is the base class for all parse tree nodes.
#
# Basically, you'll never create instances of this class or its
# descendants by yourself, you will receive them from the tree and use
# them for navigation.
#
class Node
  include ProcMe

  # @param params [Hash] node params, see {#params}
  def initialize(params = {})
    @params = params
  end

  # Hash of node "params".
  #
  # The notion of params is roughly the same as tag attributes in HTML.
  # It is relevant for complex nodes like images, tables, raw HTML tags
  # and so on.
  #
  # The most relevant params are typically exposed by the node as
  # instance methods (like {Heading#level}).
  #
  # @return [Hash]
  attr_reader :params

  # Node's parent in tree
  # @return [Node]
  attr_accessor :parent

  # Nodes are equal when they are of exactly the same class and the
  # class-specific `_eq` check passes.
  def ==(other)
    self.class == other.class && _eq(other)
  end

  # Position in parent's children array (zero-based); 0 for a node
  # without parent.
  def index
    parent ? parent.index_of(self) : 0
  end

  # List of all sibling nodes (children of same parent)
  def siblings
    parent ? parent.children - [self] : Nodes[]
  end

  # List of siblings before this one
  def prev_siblings
    # Own position does not change while filtering: look it up once.
    own_index = index
    siblings.select { |node| node.index < own_index }
  end

  # List of siblings after this one
  def next_siblings
    # Own position does not change while filtering: look it up once.
    own_index = index
    siblings.select { |node| node.index > own_index }
  end

  # Node children list
  def children
    Nodes[] # redefined in descendants
  end

  # Used only during tree construction in {Parser}.
  def can_merge?(other)
    false
  end

  # Whether node is empty (definition of "empty" varies for different
  # kinds of nodes). Used mainly in {Parser}.
  def empty?
    false
  end

  # Textual representation of this node and its children, ready for
  # pretty-printing. Use it like this:
  #
  # ```ruby
  # puts page.lookup(:Paragraph).first.to_tree
  # # Prints something like
  # # <Paragraph>
  # #   This <Italic>
  # #   is <Text>
  # #   <Wikilink(link: "Argentina")>
  # #     pretty <Italic>
  # #   complicated <Text>
  # ```
  #
  # Useful for understanding page structure, and Infoboxer's
  # representation of this structure.
  def to_tree(level = 0)
    indent(level) + "<#{descr}>\n"
  end

  # Short description: class and params, plus the beginning of the
  # node's text when there is any.
  def inspect
    text.empty? ? "#<#{descr}>" : "#<#{descr}: #{shorten_text}>"
  end

  # Node text representation. It is defined for all nodes so that
  # the entire `Document#text` produces a readable text-only
  # representation of a Wiki page. Therefore, the rules are:
  # * inline-formatting nodes (text, bold, italics) just return the
  #   text;
  # * paragraph-level nodes (headings, paragraphs, lists) add `"\n\n"`
  #   after text;
  # * list items add marker before text;
  # * nodes not belonging to the "main" text flow (references,
  #   templates) produce empty text.
  #
  # If you want just the text of some heading or list item (without
  # "formatting" quirks), you can use the {Node#text_} method.
  #
  def text
    '' # redefined in descendants
  end

  # "Clean" version of node text: {#text} with leading and trailing
  # whitespace (like the line feeds added for formatting) stripped.
  #
  def text_
    text.strip
  end

  # See {Node#text_}
  def to_s
    # just aliases will not work when #text will be redefined in subclasses
    text_
  end

  private

  # Longest text shown by #inspect before it gets cut.
  MAX_CHARS = 30

  # Clean text cut down to at most MAX_CHARS characters; the ellipsis is
  # added only when something was really cut off.
  def shorten_text
    clean = text_ # computed once: it may walk the entire subtree
    clean.length > MAX_CHARS ? "#{clean[0, MAX_CHARS]}..." : clean
  end

  # Class name without its namespace.
  def clean_class
    self.class.name.sub(/^.*::/, '')
  end

  # Class name, with params in parentheses when there are any.
  def descr
    if !params || params.empty?
      "#{clean_class}"
    else
      "#{clean_class}(#{show_params})"
    end
  end

  # "key: value" list for the given hash (node params by default).
  def show_params(prms = nil)
    (prms || params).map { |k, v| "#{k}: #{v.inspect}" }.join(', ')
  end

  # Indentation for the given nesting level of #to_tree output.
  def indent(level)
    ' ' * level
  end

  # Class-specific part of #==.
  def _eq(other)
    fail(NotImplementedError, "#_eq should be defined in subclasses")
  end

  # Decodes the string with the shared HTML entities decoder.
  def decode(str)
    Node.coder.decode(str)
  end

  class << self
    # Internal: descendants DSL. Defines a reader per key, returning
    # the corresponding value from {#params}.
    def def_readers(*keys)
      keys.each do |k|
        define_method(k) { params[k] }
      end
    end

    # Internal: HTML entities decoder, created on first use.
    def coder
      @coder ||= HTMLEntities.new
    end
  end
end
180
+ end
181
+ end