infoboxer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,132 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class Parser
4
+ # http://en.wikipedia.org/wiki/Help:Table
5
+ module Table
6
+ include Tree
7
+
8
# Parses a whole wiki table, starting on the line that opens it with `{|`.
#
# If the current line does not open a table, the problem is reported
# through @context.fail!. Following lines are handed to #table_next_line
# one at a time until it returns a falsy value. Finally, direct children
# of the table that have no children of their own are removed.
#
# Returns the built Tree::Table node.
def table
  unless @context.current =~ /^\s*{\|/
    @context.fail!('Something went wrong: trying to parse not a table')
  end

  attrs = table_params
  node = Tree::Table.new(Nodes[], attrs)

  @context.next!

  # each handled line is followed by a step to the next one
  while table_next_line(node)
    @context.next!
  end

  # FIXME: not the most elegant way to get rid of empty children
  node.children.reject! { |child| child.children.empty? }

  node
end
27
+
28
# Skips the table opening marker (`{|`) and parses whatever @context.rest
# returns afterwards as table attributes.
#
# Returns the result of #parse_params.
def table_params
  @context.skip(/\s*{\|/)
  attrs_source = @context.rest
  parse_params(attrs_source)
end
32
+
33
# Dispatches the current line of a table to the matching handler.
#
# table - the table node being filled.
#
# Returns false when the line closes the table (`|}`), true otherwise,
# so the caller knows whether to keep reading lines.
def table_next_line(table)
  case @context.current
  when nil
    @context.fail!("End of input before table ended!")

  when /^\s*\|}(.*)$/
    # table end: consume the closing marker and stop
    @context.scan(/^\s*\|}/)
    return false

  when /^\s*!/
    # heading (th) cells in a row
    table_cells(table, TableHeading)

  when /^\s*\|\+/
    # caption
    table_caption(table)

  when /^\s*\|-(.*)$/
    # row start; the rest of the line carries row attributes
    table_row(table, Regexp.last_match(1))

  when /^\s*\|/
    # ordinary cells in a row
    table_cells(table)

  when /^\s*{{/
    # template can be at row level
    table_template(table)

  else
    # anything else continues the previous cell/caption
    table_cell_cont(table)
  end

  true
end
62
+
63
# Appends a new, empty TableRow to the table.
#
# table     - the table node receiving the row.
# param_str - raw attribute text found after `|-`; parsed by #parse_params.
#
# Returns whatever table.push_children returns.
def table_row(table, param_str)
  cells = Nodes[]
  attrs = parse_params(param_str)
  row = TableRow.new(cells, attrs)
  table.push_children(row)
end
66
+
67
# Parses a caption line (`|+ ...`) and appends a TableCaption to the table.
#
# table - the table node receiving the caption.
def table_caption(table)
  @context.skip(/^\s*\|\+\s*/)

  caption_nodes = inline(/^\s*([|!]|{\|)/)
  # table() calls next! after every handled line, so step back once here
  @context.prev!

  caption = TableCaption.new(caption_nodes.strip)
  table.push_children(caption)
end
74
+
75
# Parses one line of cells and appends them to the last row of the table.
# A new TableRow is created first when the table's last child is not a row.
#
# table      - the table node being filled.
# cell_class - class instantiated for every cell (TableCell by default;
#              the caller passes TableHeading for `!` lines).
def table_cells(table, cell_class = TableCell)
  unless table.children.last.is_a?(TableRow)
    table.push_children(TableRow.new)
  end
  current_row = table.children.last

  @context.skip(/\s*[!|]\s*/)
  guarded_loop do
    # a leading "attributes|" part, when present, is parsed as cell params
    attrs =
      if @context.check(/[^|{|\[]+\|([^\|]|$)/)
        parse_params(@context.scan_until(/\|/))
      else
        {}
      end

    # cells on the same line are separated by `||` or `!!`
    body = short_inline(/(\|\||!!)/)
    current_row.push_children(cell_class.new(body, attrs))
    break if @context.eol?
  end
end
91
+
92
# Handles a template that starts a line inside a table.
#
# The parsed contents are attached to the innermost open container:
# the last cell of the last row when it is a BaseCell, otherwise the
# last row when it is a TableRow, otherwise the table itself.
#
# table - the table node being filled.
def table_template(table)
  contents = paragraph(/^\s*([|!]|{\|)/).to_templates?

  last_row = table.children.last
  target =
    if last_row.is_a?(TableRow)
      last_cell = last_row.children.last
      last_cell.is_a?(BaseCell) ? last_cell : last_row
    else
      table
    end

  target.push_children(*contents)
end
105
+
106
+ # On-the-fly TableCaption creation handles (real life) case, when
107
+ # table has "HTML caption":
108
+ # {|
109
+ # <caption>....</caption>
110
+ #
111
+ # Solution is NOT elegant or semantically "right", yet it works.
112
+ # Somehow.
113
+ #
114
# Attaches a continuation line (one that starts no new table element)
# to the element it belongs to.
#
# The container is the last cell of the last row, an existing caption,
# or a freshly created TableCaption when there is nothing suitable yet.
# A freshly created container has no parent and is therefore pushed to
# the table after receiving its content.
#
# table - the table node being filled.
def table_cell_cont(table)
  previous = table.children.last

  container =
    case previous
    when nil
      TableCaption.new
    when TableRow
      last_cell = previous.children.last
      if last_cell.is_a?(BaseCell)
        last_cell
      else
        TableCaption.new
      end
    when TableCaption
      previous
    else
      @context.fail!("Multiline cell inside #{previous}")
    end

  container.push_children(paragraph(/^\s*([|!]|{\|)/))
  table.push_children(container) unless container.parent
end
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class Parser
4
+ module Template
5
+ include Tree
6
+
7
+ # NB: here we do not distinguish templates like {{Infobox|variable}}
8
+ # and "magic words" like {{formatnum:123}}
9
+ # Just calling all of them "templates". This behaviour will change
10
+ # in future, I presume
11
+ # More about magic words: https://www.mediawiki.org/wiki/Help:Magic_words
12
# Parses a template (or "magic word") after its opening braces.
#
# The name is everything up to the first `|`, `:` or `}}`. When the
# template closes right after the name it gets an empty Nodes list as
# variables, otherwise #template_vars parses them.
#
# Returns an instance of the class that the templates registry of the
# current traits finds for this name.
def template
  name = @context.scan_continued_until(/\||:|}}/)
  @context.fail!("Template name not found") unless name

  name.strip!
  vars =
    if @context.eat_matched?('}}')
      Nodes[]
    else
      template_vars
    end

  template_class = @context.traits.templates.find(name)
  template_class.new(name, vars)
end
20
+
21
# Parses template variables up to (and including) the closing `}}`.
#
# A variable written as `name = value` keeps its name; any other one is
# named by its 1-based position in the template.
#
# Returns Nodes of Var.
def template_vars
  position = 1
  vars = Nodes[]

  guarded_loop do
    name =
      if @context.check(/\s*([^ =}|]+)\s*=\s*/)
        explicit = @context.scan(/\s*([^ =]+)/).strip
        @context.skip(/\s*=\s*/)
        explicit
      else
        position
      end

    value = long_inline(/\||}}/)
    # an empty value under a positional name was just an empty line: skip it
    if !value.empty? || !name.is_a?(Numeric)
      vars << Var.new(name.to_s, value)
    end

    break if @context.eat_matched?('}}')

    if @context.eof?
      @context.fail!("Unexpected break of template variables: #{vars}")
    end

    position += 1
  end

  vars
end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,73 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class Parser
4
+ module Util
5
+ attr_reader :re
6
+
7
+ FORMATTING = /(
8
+ '{2,5} | # bold, italic
9
+ \[\[ | # link
10
+ {{ | # template
11
+ \[[a-z]+:\/\/ | # external link
12
+ <nowiki[^>]*> | # nowiki
13
+ <ref[^>]*> | # reference
14
+ < # HTML tag
15
+ )/x
16
+
17
+ INLINE_EOL = %r[(?= # if we have ahead... (not scanned, just checked
18
+ </ref> | # <ref> closed
19
+ }} # or template closed
20
+ )]x
21
+
22
+
23
# Builds the set of regexps used by the parser.
#
# Returns a Hash with these keys:
#   :file_prefix              - matches any file prefix of the current
#                               traits followed by a colon;
#   :formatting               - the FORMATTING regexp;
#   :inline_until_cache       - Hash that lazily builds and remembers, per
#                               terminator, the union of the terminator,
#                               FORMATTING and end of line;
#   :short_inline_until_cache - same, with INLINE_EOL added to the union.
# A nil terminator is simply left out of the union.
def make_regexps
  prefixes = @context.traits.file_prefix.join('|')

  inline_cache = Hash.new do |cache, terminator|
    cache[terminator] = Regexp.union(*[terminator, FORMATTING, /$/].compact.uniq)
  end

  short_inline_cache = Hash.new do |cache, terminator|
    cache[terminator] = Regexp.union(*[terminator, INLINE_EOL, FORMATTING, /$/].compact.uniq)
  end

  {
    file_prefix: /(#{prefixes}):/,
    formatting: FORMATTING,
    inline_until_cache: inline_cache,
    short_inline_until_cache: short_inline_cache
  }
end
35
+
36
# Parses an HTML-like attribute string, e.g. `class="wikitable" width=100%`.
#
# str - String with attributes, or nil.
#
# Supported forms:
#   name="value" / name='value' - quoted value, may contain whitespace;
#   name=value                  - value runs up to the next whitespace;
#   name                        - bare flag, stored with its own name as value.
#
# An unterminated quote takes the rest of the string as the value instead
# of failing, and unquoted values do not include the whitespace that
# follows them.
#
# Returns a Hash with Symbol keys and String values ({} for nil input).
def parse_params(str)
  return {} unless str

  scan = StringScanner.new(str)
  params = {}
  loop do
    scan.skip(/\s*/)
    name = scan.scan(/[^ \t=]+/) or break
    scan.skip(/\s*/)

    if scan.peek(1) == '='
      scan.skip(/=\s*/)
      quote = scan.scan(/['"]/)
      value =
        if quote
          quoted = scan.scan_until(/#{quote}/)
          # no closing quote: consume everything that is left
          quoted ? quoted.chomp(quote) : scan.scan(/.*/m)
        else
          # stop before whitespace, so it does not end up in the value
          scan.scan(/\S*/)
        end
      params[name.to_sym] = value
    else
      params[name.to_sym] = name
    end
  end
  params
end
60
+
61
# Runs the given block in an endless loop, checking after each pass that
# the context position (line and column) has changed. A pass that leaves
# the position untouched is reported through @context.fail! as an
# infinite loop. The block is expected to `break` when it is done.
def guarded_loop
  loop do
    before = [@context.lineno, @context.colno]
    yield
    after = [@context.lineno, @context.colno]

    if after == before
      @context.fail!("Infinite loop on position #{after.last}")
    end
  end
end
70
+
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,10 @@
1
+ module Infoboxer
2
+ # Templates are cool, powerful and undocumented. Sorry :(
3
+ #
4
+ # I do my best.
5
+ module Templates
6
+ %w[base set].each do |tmpl|
7
+ require_relative "templates/#{tmpl}"
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,82 @@
1
+ module Infoboxer
2
+ module Templates
3
# Base class for all template classes: a Tree::Template with helpers
# for fetching variables.
#
# Each class carries a `template_name` and `template_options` at class
# level; both are writable through the class-level accessors.
class Base < Tree::Template
  include Tree

  class << self
    attr_accessor :template_name, :template_options

    # Named template classes are shown as
    # "Infoboxer::Templates::Template[name]"; unnamed ones fall back to
    # the default inspect.
    def inspect
      return super unless template_name

      "Infoboxer::Templates::#{clean_name}"
    end

    # Short class label: "Template[name]" when a name is set,
    # plain "Template" otherwise.
    def clean_name
      if template_name
        "Template[#{template_name}]"
      else
        'Template'
      end
    end
  end

  # Variables whose name consists of digits only.
  def unnamed_variables
    variables.select { |var| var.name =~ /^\d+$/ }
  end

  # Looks up variables by each of the given name patterns and returns
  # all the results flattened into one Nodes list.
  def fetch(*patterns)
    found = patterns.map { |pattern| variables.find(name: pattern) }
    Nodes[*found.flatten]
  end

  # Same lookup as #fetch, returned as a Hash of variable name => variable.
  def fetch_hash(*patterns)
    pairs = fetch(*patterns).map { |var| [var.name, var] }
    pairs.to_h
  end

  # Builds a Date from the fetched variables, passing their integer
  # values to Date.new in order. Trailing nils are dropped first.
  #
  # Returns nil when nothing is left.
  def fetch_date(*patterns)
    components = fetch(*patterns)
    components.pop until components.empty? || !components.last.nil?
    return nil if components.empty?

    Date.new(*components.map { |var| var.to_s.to_i })
  end

  # Equal to any Tree::Template for which _eq holds.
  def ==(other)
    return false unless other.kind_of?(Tree::Template)

    _eq(other)
  end

  protected

  # Uses the class-level label when the node's name is exactly the
  # class's template_name; defers to the parent implementation otherwise.
  def clean_class
    self.class.template_name == name ? self.class.clean_name : super
  end
end
55
+
56
+ # Renders all of its unnamed variables as space-separated text
57
+ # Also allows in-template navigation
58
# Template class whose children are its unnamed variables, joined with
# a single space as children separator.
class Show < Base
  alias children unnamed_variables

  protected

  # Separator placed between children.
  def children_separator
    ' '
  end
end
67
+
68
# Template class whose text is whatever #replace returns.
# Subclasses are expected to override #replace.
class Replace < Base
  # Raises NotImplementedError unless overridden by a subclass.
  def replace
    raise NotImplementedError, "Descendants should define :replace"
  end

  # Text of the template: the replacement.
  def text
    replace
  end
end
77
+
78
# Template class whose text is its own name.
class Literal < Base
  alias text name
end
81
+ end
82
+ end
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ module Templates
4
# Registry of template classes with a small definition DSL.
#
# Definitions are evaluated in the context of the set, so the private
# methods below (`template`, `replace`, `show`, `literal`) are available
# inside the block given to #initialize or #define.
class Set
  # Creates an empty set and evaluates the definitions block, if given.
  def initialize(&definitions)
    @templates = []
    define(&definitions) if definitions
  end

  # Returns the first registered class whose matcher `===` the downcased
  # name, or Base when nothing matches. Later definitions are checked
  # first, because they are put at the front of the list.
  def find(name)
    entry = @templates.find { |matcher, _klass| matcher === name.downcase }
    entry ? entry.last : Base
  end

  # Evaluates the block in the context of this set.
  def define(&definitions)
    instance_eval(&definitions)
  end

  # Forgets all registered classes.
  def clear
    @templates.clear
  end

  private

  # Registers a Base-derived class; the block becomes the class body.
  def template(name, options = {}, &definition)
    setup_class(name, Base, options, &definition)
  end

  # Registers Replace-derived classes.
  #
  # Accepts either two Strings (template name and replacement text) or a
  # single Hash of name => replacement pairs; anything else raises
  # ArgumentError.
  def replace(*replacements)
    if replacements.count == 2 && replacements.all? { |item| item.is_a?(String) }
      name, what = *replacements
      setup_class(name, Replace) do
        define_method(:replace) { what }
      end
    elsif replacements.count == 1 && replacements.first.is_a?(Hash)
      replacements.first.each do |name, what|
        replace(name, what)
      end
    else
      raise ArgumentError, "Can't call :replace with #{replacements.join(', ')}"
    end
  end

  # Registers a Show-derived class for every given name.
  def show(*names)
    names.each { |name| setup_class(name, Show) }
  end

  # Registers a Literal-derived class for every given name.
  def literal(*names)
    names.each { |name| setup_class(name, Literal) }
  end

  # Creates and registers a new template class.
  #
  # name       - template name, stored as the class's template_name.
  # base_class - default superclass.
  # options    - Hash; :match overrides the matcher (downcased name by
  #              default), :base overrides the superclass and may be a
  #              String, which is then resolved through #find.
  #
  # Returns the new class.
  def setup_class(name, base_class, options = {}, &definition)
    match = options.fetch(:match, name.downcase)
    base = options.fetch(:base, base_class)
    base = find(base) if base.is_a?(String)

    klass = Class.new(base, &definition)
    klass.template_name = name
    klass.template_options = options
    @templates.unshift([match, klass])
    klass
  end
end
71
+ end
72
+ end
@@ -0,0 +1,70 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ # Infoboxer provides you with tree structure of the Wikipedia page,
4
+ # which you can introspect and navigate with ease. This tree structure
5
+ # tries to be simple, close to Wikipedia source and logical.
6
+ #
7
+ # You can always inspect entire page tree yourself:
8
+ #
9
+ # ```ruby
10
+ # page = Infoboxer.wp.get('Argentina')
11
+ # puts page.to_tree
12
+ # ```
13
+ #
14
+ # ## Inspecting and understanding single node
15
+ #
16
+ # Each tree node is descendant of {Tree::Node}, so you should look
17
+ # at this class to understand what you can do.
18
+ #
19
+ # Alongside with basic methods, defined in Node class, some useful
20
+ # utility methods are defined in subclasses.
21
+ #
22
+ # Here's full list of subclasses, representing real nodes, with their
23
+ # respective roles:
24
+ #
25
+ # * inline markup: {Text}, {Bold}, {Italic}, {BoldItalic}, {Wikilink},
26
+ # {ExternalLink}, {Image};
27
+ # * embedded HTML: {HTMLTag}, {HTMLOpeningTag}, {HTMLClosingTag};
28
+ # * paragraph-level nodes: {Heading}, {Paragraph}, {Pre}, {HR};
29
+ # * lists: {OrderedList}, {UnorderedList}, {DefinitionList}, {ListItem},
30
+ # {DTerm}, {DDefinition};
31
+ # * tables: {Table}, {TableCaption}, {TableRow}, {TableHeading}, {TableCell};
32
+ # * special elements: {Template}, {Ref}.
33
+ #
34
+ # ## Tree navigation
35
+ #
36
+ # {Tree::Node} class has a standard list of methods for traversing tree
37
+ # upwards, downwards and sideways: `children`, `parent`, `siblings`,
38
+ # `index`. Read through class documentation for their detailed
39
+ # descriptions.
40
+ #
41
+ # {Navigation} module contains more advanced navigational functionality,
42
+ # like XPath-like selectors, friendly shortcuts, breakup of document
43
+ # into logical "sections" and so on.
44
+ #
45
+ # Most of navigational and other Node's methods return {Nodes} type,
46
+ # which is an `Array` descendant with additional functionality.
47
+ #
48
+ # ## Complex data extraction
49
+ #
50
+ # Most of uniform, machine-extractable data in Wikipedia is stored in
51
+ # templates and tables. There's entire {Templates} module, which is
52
+ # documented explaining what you can do about Wikipedia templates, how
53
+ # to understand them and use information. Also, you can look at {Table}
54
+ # class, which for now is not that powerful, yet allows you to extract
55
+ # some columns and rows.
56
+ #
57
+ # Also, consider that WIKIpedia is made of WIKIlinks, and {Wikilink#follow}
58
+ # (as well as {Nodes#follow} for multiple links at once) is your good friend.
59
+ #
60
+ module Tree
61
+ require_relative 'tree/node'
62
+ require_relative 'tree/nodes'
63
+
64
+ %w[text compound inline
65
+ image html paragraphs list template table ref
66
+ document].each do |type|
67
+ require_relative "tree/#{type}"
68
+ end
69
+ end
70
+ end