infoboxer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,132 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of wiki table markup (`{| ... |}`).
    #
    # Reference: http://en.wikipedia.org/wiki/Help:Table
    module Table
      include Tree

      # Parses a table that starts on the current line of @context.
      #
      # Lines are consumed one by one through #table_next_line until it
      # reports the end of the table; rows that ended up without children
      # are dropped afterwards.
      #
      # @return [Tree::Table] the parsed table node
      def table
        unless @context.current =~ /^\s*{\|/
          @context.fail!('Something went wrong: trying to parse not a table')
        end

        params = table_params
        node = Tree::Table.new(Nodes[], params)

        @context.next!

        loop do
          break unless table_next_line(node)
          @context.next!
        end

        # FIXME: not the most elegant way, huh?
        node.children.reject! { |row| row.children.empty? }

        node
      end

      # Skips the table opening marker and parses whatever remains of the
      # line as table parameters.
      def table_params
        @context.skip(/\s*{\|/)
        rest_of_line = @context.rest
        parse_params(rest_of_line)
      end

      # Dispatches on the shape of the current line and feeds it into +table+.
      #
      # @return [Boolean] false once the table end marker was consumed,
      #   true when parsing of the table should go on
      def table_next_line(table)
        line = @context.current

        case line
        when /^\s*\|}(.*)$/ # table end
          @context.scan(/^\s*\|}/)
          return false      # should not continue
        when /^\s*!/        # heading (th) in a row
          table_cells(table, TableHeading)
        when /^\s*\|\+/     # caption
          table_caption(table)
        when /^\s*\|-(.*)$/ # row start; the capture holds the row parameters
          table_row(table, Regexp.last_match(1))
        when /^\s*\|/       # cell in row
          table_cells(table)
        when /^\s*{{/       # template can be at row level
          table_template(table)
        when nil
          @context.fail!("End of input before table ended!")
        else
          table_cell_cont(table)
        end

        true # should continue parsing
      end

      # Appends a new, empty row built from the row parameter string.
      def table_row(table, param_str)
        row_params = parse_params(param_str)
        new_row = TableRow.new(Nodes[], row_params)
        table.push_children(new_row)
      end

      # Parses a caption line (`|+ ...`) and appends it to +table+.
      def table_caption(table)
        @context.skip(/^\s*\|\+\s*/)

        caption_nodes = inline(/^\s*([|!]|{\|)/)
        @context.prev! # compensate next! which will be done in table()

        caption = TableCaption.new(caption_nodes.strip)
        table.push_children(caption)
      end

      # Parses every cell found on the current line into the last row of
      # +table+, opening a fresh row first when the table's last child is
      # not a row.
      #
      # @param cell_class node class instantiated for each cell
      def table_cells(table, cell_class = TableCell)
        unless table.children.last.is_a?(TableRow)
          table.push_children(TableRow.new())
        end
        row = table.children.last

        @context.skip(/\s*[!|]\s*/)
        guarded_loop do
          # text followed by a single pipe is treated as cell parameters
          params =
            if @context.check(/[^|{|\[]+\|([^\|]|$)/)
              parse_params(@context.scan_until(/\|/))
            else
              {}
            end

          content = short_inline(/(\|\||!!)/)
          row.push_children(cell_class.new(content, params))
          break if @context.eol?
        end
      end

      # Handles a line starting with a template: its contents go into the
      # last cell of the last row, or into the last row itself, or straight
      # into the table -- whichever of those exists, checked in that order.
      def table_template(table)
        contents = paragraph(/^\s*([|!]|{\|)/).to_templates?

        last_row = table.children.last
        target =
          if last_row.is_a?(TableRow)
            last_cell = last_row.children.last
            last_cell.is_a?(BaseCell) ? last_cell : last_row
          else
            table
          end

        target.push_children(*contents)
      end

      # Continuation of a multiline cell or caption.
      #
      # On-the-fly TableCaption creation handles (real life) case, when
      # table has "HTML caption":
      #     {|
      #     <caption>....</caption>
      #
      # Solution is NOT elegant or semantically "right", yet it works.
      # Somehow.
      def table_cell_cont(table)
        previous = table.children.last

        container =
          case previous
          when TableRow
            last_cell = previous.children.last
            last_cell.is_a?(BaseCell) ? last_cell : TableCaption.new
          when TableCaption
            previous
          when nil
            TableCaption.new
          else
            @context.fail!("Multiline cell inside #{previous}")
          end

        container.push_children(paragraph(/^\s*([|!]|{\|)/))
        # a freshly created caption has no parent yet and must be attached
        table.push_children(container) unless container.parent
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of `{{...}}` constructs.
    module Template
      include Tree

      # Parses one template; called when the scanner is positioned right
      # after the template opening.
      #
      # NB: templates like {{Infobox|variable}} and "magic words" like
      # {{formatnum:123}} are not distinguished here: all of them are
      # called "templates". This behaviour will probably change in future.
      # More about magic words: https://www.mediawiki.org/wiki/Help:Magic_words
      #
      # @return an instance of the class that the templates set of the
      #   current traits provides for the template name
      def template
        name = @context.scan_continued_until(/\||:|}}/)
        @context.fail!("Template name not found") unless name

        name.strip!

        variables =
          if @context.eat_matched?('}}')
            Nodes[] # template was closed right after its name
          else
            template_vars
          end

        template_class = @context.traits.templates.find(name)
        template_class.new(name, variables)
      end

      # Parses template variables up to the closing `}}`.
      #
      # Variables written as `name = value` keep their name; every other
      # variable is named after its 1-based position.
      #
      # @return [Nodes] list of Var nodes
      def template_vars
        position = 1
        collected = Nodes[]

        guarded_loop do
          key =
            if @context.check(/\s*([^ =}|]+)\s*=\s*/)
              scanned = @context.scan(/\s*([^ =]+)/).strip
              @context.skip(/\s*=\s*/)
              scanned
            else
              position
            end

          value = long_inline(/\||}}/)

          # an empty positional value was just an empty line: nothing to keep
          empty_positional = value.empty? && key.is_a?(Numeric)
          collected << Var.new(key.to_s, value) unless empty_positional

          break if @context.eat_matched?('}}')
          if @context.eof?
            @context.fail!("Unexpected break of template variables: #{collected}")
          end

          position += 1
        end

        collected
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Helpers shared by the parser modules: regexp tables, parsing of
    # HTML-like attribute strings and a loop guard.
    module Util
      attr_reader :re

      # Markup that starts some inline formatting.
      #
      # NOTE: the inline labels on the <nowiki> and <ref> alternatives
      # below are swapped ("reference" annotates <nowiki>, "nowiki"
      # annotates <ref>); the pattern text is intentionally left untouched.
      FORMATTING = /(
        '{2,5}        |     # bold, italic
        \[\[          |     # link
        {{            |     # template
        \[[a-z]+:\/\/ |     # external link
        <nowiki[^>]*> |     # reference
        <ref[^>]*>    |     # nowiki
        <                   # HTML tag
      )/x

      # Lookahead for constructs that terminate "short" inline content.
      INLINE_EOL = %r[(?=   # if we have ahead... (not scanned, just checked
        </ref>        |     # <ref> closed
        }}                  # or template closed
      )]x


      # Builds the table of regexps used while parsing.
      #
      # The two *_cache entries are hashes that lazily build (and remember)
      # a union of the requested terminator with the formatting patterns
      # and end-of-line.
      #
      # @return [Hash]
      def make_regexps
        {
          file_prefix: /(#{@context.traits.file_prefix.join('|')}):/,
          formatting: FORMATTING,
          inline_until_cache: Hash.new{|h, r|
            h[r] = Regexp.union(*[r, FORMATTING, /$/].compact.uniq)
          },
          short_inline_until_cache: Hash.new{|h, r|
            h[r] = Regexp.union(*[r, INLINE_EOL, FORMATTING, /$/].compact.uniq)
          }
        }
      end

      # Parses an HTML-attributes-like string into a hash.
      #
      #   parse_params('class="wikitable sortable" border=1 nowrap')
      #   # => {class: 'wikitable sortable', border: '1', nowrap: 'nowrap'}
      #
      # A name without `=` is stored with itself as the value. A quoted
      # value whose closing quote is missing runs to the end of the string
      # (previously this raised NoMethodError on nil). Unquoted values end
      # before the first whitespace character (previously that whitespace
      # character was included in the value).
      #
      # @param str [String, nil] raw parameters text
      # @return [Hash{Symbol => String}] empty hash when +str+ is nil
      def parse_params(str)
        return {} unless str

        scanner = StringScanner.new(str)
        params = {}

        loop do
          scanner.skip(/\s*/)
          name = scanner.scan(/[^ \t=]+/)
          break unless name

          scanner.skip(/\s*/)
          if scanner.peek(1) == '='
            scanner.skip(/=\s*/)
            quote = scanner.scan(/['"]/)
            value =
              if quote
                # up to the closing quote, or to the end when it is missing
                scanner.scan_until(/#{quote}|\z/).chomp(quote)
              else
                scanner.scan(/\S*/)
              end
            params[name.to_sym] = value
          else
            params[name.to_sym] = name
          end
        end

        params
      end

      # Endless loop around the given block, which is expected to advance
      # the parsing position on every pass (and to `break` when done).
      # If a pass leaves both line and column of @context unchanged, the
      # parser is stuck, and @context.fail! is called.
      def guarded_loop
        loop do
          before = [@context.lineno, @context.colno]
          yield
          after = [@context.lineno, @context.colno]

          if after == before
            @context.fail!("Infinite loop on position #{after.last}")
          end
        end
      end

    end
  end
end
@@ -0,0 +1,10 @@
1
module Infoboxer
  # Templates are cool, powerful and undocumented. Sorry :(
  #
  # I do my best.
  #
  # Loading this file loads the template base classes and the templates
  # set, in that order.
  module Templates
    require_relative 'templates/base'
    require_relative 'templates/set'
  end
end
@@ -0,0 +1,82 @@
1
module Infoboxer
  module Templates
    # Base class of all template classes produced by a templates set.
    class Base < Tree::Template
      include Tree

      class << self
        # Name and options the class was registered with.
        attr_accessor :template_name, :template_options

        # Classes carrying a template name are shown by that name;
        # everything else falls back to the default inspection.
        def inspect
          return super unless template_name

          "Infoboxer::Templates::#{clean_name}"
        end

        # Short class label: "Template[<name>]", or just "Template" when
        # no template name is set.
        def clean_name
          return 'Template' unless template_name

          "Template[#{template_name}]"
        end
      end

      # Variables whose name consists of digits only.
      def unnamed_variables
        variables.select { |var| var.name =~ /^\d+$/ }
      end

      # Looks up variables by each of the given name patterns.
      #
      # @return [Nodes] everything found, in the order of +patterns+
      def fetch(*patterns)
        found = patterns.map { |pattern| variables.find(name: pattern) }
        Nodes[*found.flatten]
      end

      # Same lookup as #fetch, returned as a hash keyed by variable name.
      def fetch_hash(*patterns)
        fetch(*patterns).map { |var| [var.name, var] }.to_h
      end

      # Builds a Date from the fetched variables, each converted to an
      # integer through its string form.
      #
      # @return [Date, nil] nil when nothing was fetched
      def fetch_date(*patterns)
        components = fetch(*patterns)
        # drop trailing nils
        components.pop until components.empty? || !components.last.nil?
        return nil if components.empty?

        Date.new(*components.map { |var| var.to_s.to_i })
      end

      # Any Tree::Template may be equal to this one; the comparison itself
      # is delegated to _eq.
      def ==(other)
        other.is_a?(Tree::Template) && _eq(other)
      end

      protected

      # Uses the class' clean name only when the instance carries the very
      # name its class was registered with.
      def clean_class
        return super unless self.class.template_name == name

        self.class.clean_name
      end
    end

    # Renders all of its unnamed variables as space-separated text
    # Also allows in-template navigation
    class Show < Base
      alias children unnamed_variables

      protected

      def children_separator
        ' '
      end
    end

    # Template whose text is whatever #replace returns; descendants are
    # expected to define #replace.
    class Replace < Base
      def replace
        raise NotImplementedError, "Descendants should define :replace"
      end

      def text
        replace
      end
    end

    # Template whose text is its own name.
    class Literal < Base
      alias text name
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
module Infoboxer
  module Templates
    # Registry of template classes, filled through a small DSL
    # (+template+, +replace+, +show+, +literal+) evaluated inside the
    # definition block.
    class Set
      # @yield optional definitions block, evaluated in the context of
      #   the new set
      def initialize(&definitions)
        @templates = []
        define(&definitions) if definitions
      end

      # Finds the class registered for a template name.
      #
      # The name is downcased once and tested with `===` against every
      # registered matcher; definitions made later are checked first.
      #
      # @param name [String]
      # @return [Class] the matching class, or Base when nothing matches
      def find(name)
        needle = name.downcase # loop-invariant: computed once per lookup
        _, template = @templates.find { |matcher, _| matcher === needle }
        template || Base
      end

      # Evaluates more definitions in the context of this set.
      def define(&definitions)
        instance_eval(&definitions)
      end

      # Forgets all definitions.
      def clear
        @templates.clear
      end

      private

      # Defines a Base-derived template class.
      #
      # @param options [Hash] :match -- matcher to register instead of
      #   the downcased name; :base -- base class, or the name of an
      #   already registered template to inherit from
      def template(name, options = {}, &definition)
        setup_class(name, Base, options, &definition)
      end

      # Defines Replace-derived templates returning a fixed string.
      # Accepts either a (name, replacement) pair of strings, or a single
      # hash of name => replacement.
      #
      # @raise [ArgumentError] on any other arguments
      def replace(*replacements)
        if replacements.count == 2 && replacements.all? { |r| r.is_a?(String) }
          name, what = replacements
          setup_class(name, Replace) do
            define_method(:replace) { what }
          end
        elsif replacements.count == 1 && replacements.first.is_a?(Hash)
          replacements.first.each do |name, what|
            replace(name, what)
          end
        else
          fail(ArgumentError, "Can't call :replace with #{replacements.join(', ')}")
        end
      end

      # Defines a Show-derived template for each name.
      def show(*names)
        names.each { |name| setup_class(name, Show) }
      end

      # Defines a Literal-derived template for each name.
      def literal(*names)
        names.each { |name| setup_class(name, Literal) }
      end

      # Creates the class and registers it in front of earlier definitions.
      #
      # @return [Class] the new class
      def setup_class(name, base_class, options = {}, &definition)
        # block form: the default matcher is built only when really needed
        match = options.fetch(:match) { name.downcase }
        base = options.fetch(:base, base_class)
        base = find(base) if base.is_a?(String)

        Class.new(base, &definition).tap do |cls|
          cls.template_name = name
          cls.template_options = options
          @templates.unshift([match, cls])
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ # encoding: utf-8
2
module Infoboxer
  # Infoboxer provides you with a tree structure of the Wikipedia page,
  # which you can introspect and navigate with ease. This tree structure
  # tries to be simple, logical and close to the Wikipedia source.
  #
  # You can always inspect the entire page tree yourself:
  #
  # ```ruby
  # page = Infoboxer.wp.get('Argentina')
  # puts page.to_tree
  # ```
  #
  # ## Inspecting and understanding single node
  #
  # Each tree node is a descendant of {Tree::Node}, so you should look
  # at this class to understand what you can do.
  #
  # Alongside the basic methods defined in the Node class, some useful
  # utility methods are defined in subclasses.
  #
  # Here's the full list of subclasses representing real nodes, with their
  # respective roles:
  #
  # * inline markup: {Text}, {Bold}, {Italic}, {BoldItalic}, {Wikilink},
  #   {ExternalLink}, {Image};
  # * embedded HTML: {HTMLTag}, {HTMLOpeningTag}, {HTMLClosingTag};
  # * paragraph-level nodes: {Heading}, {Paragraph}, {Pre}, {HR};
  # * lists: {OrderedList}, {UnorderedList}, {DefinitionList}, {ListItem},
  #   {DTerm}, {DDefinition};
  # * tables: {Table}, {TableCaption}, {TableRow}, {TableHeading}, {TableCell};
  # * special elements: {Template}, {Ref}.
  #
  # ## Tree navigation
  #
  # {Tree::Node} class has a standard list of methods for traversing the
  # tree upwards, downwards and sideways: `children`, `parent`, `siblings`,
  # `index`. Read through the class documentation for their detailed
  # descriptions.
  #
  # {Navigation} module contains more advanced navigational functionality,
  # like XPath-like selectors, friendly shortcuts, breakup of document
  # into logical "sections" and so on.
  #
  # Most of the navigational and other Node's methods return {Nodes} type,
  # which is an `Array` descendant with additional functionality.
  #
  # ## Complex data extraction
  #
  # Most of the uniform, machine-extractable data in Wikipedia is stored in
  # templates and tables. There's an entire {Templates} module, whose
  # documentation explains what you can do about Wikipedia templates, how
  # to understand them and use their information. Also, you can look at the
  # {Table} class, which for now is not that powerful, yet allows you to
  # extract some columns and rows.
  #
  # Also, consider that WIKIpedia is made of WIKIlinks, and {Wikilink#follow}
  # (as well as {Nodes#follow} for multiple links at once) is your good friend.
  #
  module Tree
    # Load order: node and nodes first, then the concrete node types.
    %w[
      node nodes
      text compound inline
      image html paragraphs list template table ref
      document
    ].each { |file| require_relative "tree/#{file}" }
  end
end