infoboxer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,162 @@
1
+ # encoding: utf-8
2
+ require 'rest-client'
3
+ require 'json'
4
+ require 'addressable/uri'
5
+
6
+ require_relative 'media_wiki/traits'
7
+ require_relative 'media_wiki/page'
8
+
9
+ module Infoboxer
10
+ # MediaWiki client class.
11
+ #
12
+ # Usage:
13
+ #
14
+ # ```ruby
15
+ # client = Infoboxer::MediaWiki.new('http://en.wikipedia.org/w/api.php', user_agent: 'My Own Project')
16
+ # page = client.get('Argentina')
17
+ # ```
18
+ #
19
+ # Consider using shortcuts like {Infoboxer.wiki}, {Infoboxer.wikipedia},
20
+ # {Infoboxer.wp} and so on instead of direct instantiation of this class
21
+ # (although you can if you want to!)
22
+ #
23
+ class MediaWiki
24
+ # Default Infoboxer User-Agent header.
25
+ #
26
+ # You can set yours as an option to {Infoboxer.wiki} and its shortcuts,
27
+ # or to {#initialize}
28
+ UA = "Infoboxer/#{Infoboxer::VERSION} (https://github.com/molybdenum-99/infoboxer; zverok.offline@gmail.com)"
29
+
30
+ class << self
31
+ # User agent getter/setter.
32
+ #
33
+ # Default value is {UA}.
34
+ #
35
+ # You can also use per-instance option, see {#initialize}
36
+ attr_accessor :user_agent
37
+ end
38
+
39
+ attr_reader :api_base_url
40
+
41
+ # Creating new MediaWiki client. {Infoboxer.wiki} provides shortcut
42
+ # for it, as well as shortcuts for some well-known wikis, like
43
+ # {Infoboxer.wikipedia}.
44
+ #
45
+ # @param api_base_url URL of `api.php` file in your MediaWiki
46
+ # installation. Typically, it's `<domain>/w/api.php`, but can vary
47
+ # in different wikis.
48
+ # @param options Only one option is currently supported:
49
+ # * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
50
+ def initialize(api_base_url, options = {})
51
+ @api_base_url = Addressable::URI.parse(api_base_url)
52
+ @resource = RestClient::Resource.new(api_base_url, headers: headers(options))
53
+ end
54
+
55
+ # Receive "raw" data from Wikipedia (without parsing or wrapping in
56
+ # classes).
57
+ #
58
+ # @return [Array<Hash>]
59
+ def raw(*titles)
60
+ postprocess @resource.get(
61
+ params: DEFAULT_PARAMS.merge(titles: titles.join('|'))
62
+ )
63
+ end
64
+
65
+ # Receive list of parsed wikipedia pages for list of titles provided.
66
+ # All pages are received with single query to MediaWiki API.
67
+ #
68
+ # **NB**: currently, if you are requesting more than 50 titles at
69
+ # once (MediaWiki limitation for single request), Infoboxer will
70
+ # **not** try to get other pages with subsequent queries. This will
71
+ # be fixed in future.
72
+ #
73
+ # @return [Tree::Nodes<Page>] array of parsed pages. Notes:
74
+ # * if you call `get` with only one title, one page will be
75
+ # returned instead of an array
76
+ # * if some of pages are not in wiki, they will not be returned,
77
+ # therefore resulting array can be shorter than titles array;
78
+ # you can always check `pages.map(&:title)` to see what you've
79
+ # really received; this approach allows you to write absent-minded
80
+ # code like this:
81
+ #
82
+ # ```ruby
83
+ # Infoboxer.wp.get('Argentina', 'Chile', 'Something non-existing').
84
+ # infobox.fetch('some value')
85
+ # ```
86
+ # and obtain meaningful results instead of NoMethodError or some
87
+ # NotFound.
88
+ #
89
+ def get(*titles)
90
+ pages = raw(*titles).reject{|raw| raw[:content].nil?}.
91
+ map{|raw|
92
+ traits = Traits.get(@api_base_url.host, extract_traits(raw))
93
+
94
+ Page.new(self,
95
+ Parser.paragraphs(raw[:content], traits),
96
+ raw.merge(traits: traits))
97
+ }
98
+ titles.count == 1 ? pages.first : Tree::Nodes[*pages]
99
+ end
100
+
101
+ private
102
+
103
+ # @private
104
+ PROP = [
105
+ 'revisions', # to extract content of the page
106
+ 'info', # to extract page canonical url
107
+ 'categories', # to extract default category prefix
108
+ 'images' # to extract default media prefix
109
+ ].join('|')
110
+
111
+ # @private
112
+ DEFAULT_PARAMS = {
113
+ action: :query,
114
+ format: :json,
115
+ redirects: true,
116
+
117
+ prop: PROP,
118
+ rvprop: :content,
119
+ inprop: :url,
120
+ }
121
+
122
+ def headers(options)
123
+ {'User-Agent' => options[:user_agent] || options[:ua] || self.class.user_agent || UA}
124
+ end
125
+
126
+ def extract_traits(raw)
127
+ raw.select{|k, v| [:file_prefix, :category_prefix].include?(k)}
128
+ end
129
+
130
+ def guess_traits(pages)
131
+ categories = pages.map{|p| p['categories']}.compact.flatten
132
+ images = pages.map{|p| p['images']}.compact.flatten
133
+ {
134
+ file_prefix: images.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
135
+ category_prefix: categories.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
136
+ }
137
+ end
138
+
139
+ def postprocess(response)
140
+ pages = JSON.parse(response)['query']['pages']
141
+ traits = guess_traits(pages.values)
142
+
143
+ pages.map{|id, data|
144
+ if id.to_i < 0
145
+ {
146
+ title: data['title'],
147
+ content: nil,
148
+ not_found: true
149
+ }
150
+ else
151
+ {
152
+ title: data['title'],
153
+ content: data['revisions'].first['*'],
154
+ url: data['fullurl'],
155
+ }.merge(traits)
156
+ end
157
+ }
158
+ rescue JSON::ParserError
159
+ fail RuntimeError, "Not a JSON response, seems there's not a MediaWiki API: #{@api_base_url}"
160
+ end
161
+ end
162
+ end
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class MediaWiki
4
+ # A descendant of {Tree::Document Document}, representing page,
5
+ # received from {MediaWiki} client.
6
+ #
7
+ # Alongside with document tree structure, knows document's title as
8
+ # represented by MediaWiki and human (non-API) URL.
9
+ class Page < Tree::Document
10
+ def initialize(client, children, raw)
11
+ @client = client
12
+ super(children, raw)
13
+ end
14
+
15
+ # Instance of {MediaWiki} which this page was received from
16
+ # @return {MediaWiki}
17
+ attr_reader :client
18
+
19
+ # @!attribute [r] title
20
+ # Page title.
21
+ # @return [String]
22
+
23
+ # @!attribute [r] url
24
+ # Page friendly URL.
25
+ # @return [String]
26
+
27
+ def_readers :title, :url, :traits
28
+
29
+ private
30
+
31
+ PARAMS_TO_INSPECT = [:url, :title, :domain]
32
+
33
+ def show_params
34
+ super(params.select{|k, v| PARAMS_TO_INSPECT.include?(k)})
35
+ end
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class MediaWiki
4
+ class Traits
5
+ class << self
6
+ def templates(&definition)
7
+ @templates ||= Templates::Set.new
8
+
9
+ return @templates unless definition
10
+
11
+ @templates.define(&definition)
12
+ end
13
+
14
+ # NB: explicitly store all domains in base Traits class
15
+ def domain(d)
16
+ Traits.domains.key?(d) and
17
+ fail(ArgumentError, "Domain binding redefinition: #{Traits.domains[d]}")
18
+
19
+ Traits.domains[d] = self
20
+ end
21
+
22
+ def get(domain, options = {})
23
+ cls = Traits.domains[domain]
24
+ cls ? cls.new(options) : Traits.new(options)
25
+ end
26
+
27
+ def domains
28
+ @domains ||= {}
29
+ end
30
+
31
+ def for(domain, &block)
32
+ Class.new(self, &block).domain(domain)
33
+ end
34
+
35
+ alias_method :default, :new
36
+ end
37
+
38
+ DEFAULTS = {
39
+ file_prefix: 'File',
40
+ category_prefix: 'Category'
41
+ }
42
+
43
+ def initialize(options = {})
44
+ @options = options
45
+ @file_prefix = [DEFAULTS[:file_prefix], options.delete(:file_prefix)].
46
+ flatten.compact.uniq
47
+ @category_prefix = [DEFAULTS[:category_prefix], options.delete(:category_prefix)].
48
+ flatten.compact.uniq
49
+ end
50
+
51
+ attr_reader :file_prefix, :category_prefix
52
+
53
+ #attr_accessor :re
54
+
55
+ def templates
56
+ self.class.templates
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,84 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ # Navigation is one of the things Infoboxer is proud about. It tries
4
+ # to be logical, unobtrusive and compact.
5
+ #
6
+ # There's several levels of navigation:
7
+ # * simple tree navigation;
8
+ # * navigational shortcuts;
9
+ # * logical structure navigation (sections).
10
+ #
11
+ # ## Simple tree navigation
12
+ #
13
+ # It's somewhat similar to XPath/CSS selectors you'll use to navigate
14
+ # through HTML DOM. It is represented (and documented) in {Lookup::Node}
15
+ # module. To show you the taste of it:
16
+ #
17
+ # ```ruby
18
+ # document.
19
+ # lookup(:Wikilink, text: /Chile/).
20
+ # lookup_parents(:Table){|t| t.params[:class] == 'wikitable'}.
21
+ # lookup_children(size: 3)
22
+ # ```
23
+ #
24
+ # ## Navigational shortcuts
25
+ #
26
+ # There is nothing too complicated, just pretty shortcuts over `lookup_*`
27
+ # methods, so, you can write just
28
+ #
29
+ # ```ruby
30
+ # document.paragraphs.last.wikilinks('Category')
31
+ # ```
32
+ # ...instead of
33
+ # ```ruby
34
+ # document.lookup(:Paragraph).last.lookup(:Wikilink, namespace: 'Category')
35
+ # ```
36
+ # ...and so on.
37
+ #
38
+ # Look into {Shortcuts::Node} documentation for list of shortcuts.
39
+ #
40
+ # ## Logical structure navigation
41
+ #
42
+ # MediaWiki page structure is flat, like HTML's (there's just sequence
43
+ # of headings and paragraphs). Though, for most tasks of information
44
+ # extraction it is useful to think of page as a structure of nested
45
+ # sections. {Sections} module provides such ability. It treats document
46
+ # as an {Sections::Container#intro intro} and set of subsequent
47
+ # {Sections::Section section}s of same level, which, in turn, have inside
48
+ # their own intro and sections. Also, each node has
49
+ # {Sections::Node#in_sections #in_sections} method, returning all sections
50
+ # in which it is nested.
51
+ #
52
+ # The code with sections can feel like this:
53
+ #
54
+ # ```ruby
55
+ # page.sections('Culture' => 'Music').tables
56
+ # # or like this
57
+ # page.wikilinks.select{|link| link.in_sections.first.heading.text.include?('Culture')}
58
+ # ```
59
+ #
60
+ # See {Sections::Container} for downwards section navigation, and
61
+ # {Sections::Node} for upwards.
62
+ #
63
+ module Navigation
64
+ %w[lookup shortcuts sections].each do |nav|
65
+ require_relative "navigation/#{nav}"
66
+ end
67
+
68
+ class Tree::Node
69
+ include Navigation::Lookup::Node
70
+ include Navigation::Shortcuts::Node
71
+ include Navigation::Sections::Node
72
+ end
73
+
74
+ class Tree::Nodes
75
+ include Navigation::Lookup::Nodes
76
+ include Navigation::Shortcuts::Nodes
77
+ include Navigation::Sections::Nodes
78
+ end
79
+
80
+ class Tree::Document
81
+ include Navigation::Sections::Container
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,216 @@
1
+ # encoding: utf-8
2
+ require_relative 'selector'
3
+
4
+ module Infoboxer
5
+ module Navigation
6
+ # See {Lookup::Node Lookup::Node} for everything!
7
+ module Lookup
8
+ # `Lookup::Node` module provides methods for navigating through
9
+ # page tree in XPath-like manner.
10
+ #
11
+ # What you need to know about it:
12
+ #
13
+ # ## Selectors
14
+ #
15
+ # Each `lookup_*` method (and others similar) receive
16
+ # _list of selectors_. Examples of acceptable selectors:
17
+ #
18
+ # ```ruby
19
+ # # 1. Node class:
20
+ # document.lookup(Bold) # all Bolds
21
+ #
22
+ # # 2. Class symbol
23
+ # document.lookup(:Bold)
24
+ # # same as above, useful if you don't want to include Infoboxer::Tree
25
+ # # in all of your code or write things like lookup(Infoboxer::Tree::Bold)
26
+ #
27
+ # # 3. Getter/pattern:
28
+ # document.lookup(text: /something/)
29
+ # # finds all nodes where result of getter matches pattern
30
+ #
31
+ # # Checks against patterns are performed with `===`, so you can
32
+ # # use regexps to find by text, or ranges to find by number, like
33
+ # document.lookup(:Heading, level: (3..4))
34
+ #
35
+ # # Nodes where method is not defined are ignored, so you can
36
+ # # rewrite above example as just
37
+ # document.lookup(level: 3..4)
38
+ # # ...and receive meaningful result without any NoMethodError
39
+ #
40
+ # # 4. Check symbol
41
+ # document.lookup(:bold?)
42
+ # # finds all nodes for which `:bold?` is defined and returns
43
+ # # truthy value;
44
+ #
45
+ # # 5. Code block
46
+ # document.lookup{|node| node.params.has_key?(:class)}
47
+ # ```
48
+ #
49
+ # You also can use any of those methods without **any** selector,
50
+ # thus receiving ALL parents, ALL children, ALL siblings and so on.
51
+ #
52
+ # ## Chainable navigation
53
+ #
54
+ # Each `lookup_*` method returns an instance of {Tree::Nodes} class,
55
+ # which behaves like an Array, but also defines similar set of
56
+ # `lookup_*` methods, so, you can brainlessly do the things like
57
+ #
58
+ # ```ruby
59
+ # document.
60
+ # lookup(:Paragraph){|p| p.text.length > 100}.
61
+ # lookup(:Wikilink, text: /^List of/).
62
+ # select(&:bold?)
63
+ # ```
64
+ #
65
+ # ## Underscored methods
66
+ #
67
+ # For all methods of this module you can notice "underscored" version
68
+ # (`lookup_children` vs `_lookup_children` and so on). Basically,
69
+ # underscored versions accept instance of {Lookup::Selector}, which
70
+ # is already preprocessed version of all selectors. It is kinda
71
+ # internal thing, though can be useful if you store selectors in
72
+ # variables -- it is easier to have and use just one instance of
73
+ # Selector, than list of arguments and blocks.
74
+ #
75
+ module Node
76
+ # @!method matches?(*selectors, &block)
77
+ # Checks if current node matches selectors.
78
+
79
+ # @!method lookup(*selectors, &block)
80
+ # Selects matching nodes from entire subtree inside current node.
81
+
82
+ # @!method lookup_children(*selectors, &block)
83
+ # Selects nodes only from this node's direct children.
84
+
85
+ # @!method lookup_parents(*selectors, &block)
86
+ # Selects matching nodes of this node's parents chain, up to
87
+ # entire {Tree::Document Document}.
88
+
89
+ # @!method lookup_siblings(*selectors, &block)
90
+ # Selects matching nodes from current node's siblings.
91
+
92
+ # @!method lookup_next_siblings(*selectors, &block)
93
+ # Selects matching nodes from current node's siblings, which
94
+ # are below current node in parents children list.
95
+
96
+ # @!method lookup_prev_siblings(*selectors, &block)
97
+ # Selects matching nodes from current node's siblings, which
98
+ # are above current node in parents children list.
99
+
100
+ # Underscored version of {#matches?}
101
+ def _matches?(selector)
102
+ selector.matches?(self)
103
+ end
104
+
105
+ # Underscored version of {#lookup}
106
+ def _lookup(selector)
107
+ Tree::Nodes[_matches?(selector) ? self : nil, *children._lookup(selector)].
108
+ flatten.compact
109
+ end
110
+
111
+ # Underscored version of {#lookup_children}
112
+ def _lookup_children(selector)
113
+ @children._find(selector)
114
+ end
115
+
116
+ # Underscored version of {#lookup_parents}
117
+ def _lookup_parents(selector)
118
+ case
119
+ when !parent
120
+ Tree::Nodes[]
121
+ when parent._matches?(selector)
122
+ Tree::Nodes[parent, *parent._lookup_parents(selector)]
123
+ else
124
+ parent._lookup_parents(selector)
125
+ end
126
+ end
127
+
128
+ # Underscored version of {#lookup_siblings}
129
+ def _lookup_siblings(selector)
130
+ siblings._find(selector)
131
+ end
132
+
133
+ # Underscored version of {#lookup_prev_siblings}
134
+ def _lookup_prev_siblings(selector)
135
+ prev_siblings._find(selector)
136
+ end
137
+
138
+ # Underscored version of {#lookup_next_siblings}
139
+ def _lookup_next_siblings(selector)
140
+ next_siblings._find(selector)
141
+ end
142
+
143
+ [:matches?,
144
+ :lookup, :lookup_children, :lookup_parents,
145
+ :lookup_siblings,
146
+ :lookup_next_siblings, :lookup_prev_siblings
147
+ ].map{|sym| [sym, :"_#{sym}"]}.each do |sym, underscored|
148
+
149
+ define_method(sym){|*args, &block|
150
+ send(underscored, Selector.new(*args, &block))
151
+ }
152
+ end
153
+
154
+ # Checks if node has any parent matching selectors.
155
+ def has_parent?(*selectors, &block)
156
+ !lookup_parents(*selectors, &block).empty?
157
+ end
158
+ end
159
+
160
+ # This module provides implementations for all `lookup_*` methods
161
+ # of {Lookup::Node} to be used on a list of nodes. Note that all
162
+ # those methods return _flat_ list of results (so, if you have
163
+ # found several nodes, and then look for their siblings, you should
164
+ # not expect array of arrays -- just one array of nodes).
165
+ #
166
+ # See {Lookup::Node} for detailed lookups and selectors explanation.
167
+ module Nodes
168
+ # @!method lookup(*selectors, &block)
169
+ # @!method lookup_children(*selectors, &block)
170
+ # @!method lookup_parents(*selectors, &block)
171
+ # @!method lookup_siblings(*selectors, &block)
172
+ # @!method lookup_next_siblings(*selectors, &block)
173
+ # @!method lookup_prev_siblings(*selectors, &block)
174
+
175
+ # @!method _lookup(selector)
176
+ # @!method _lookup_children(selector)
177
+ # @!method _lookup_parents(selector)
178
+ # @!method _lookup_siblings(selector)
179
+ # @!method _lookup_next_siblings(selector)
180
+ # @!method _lookup_prev_siblings(selector)
181
+
182
+ # Underscored version of {#find}.
183
+ def _find(selector)
184
+ select{|n| n._matches?(selector)}
185
+ end
186
+
187
+ # Selects nodes of current list (and only it, no children checks),
188
+ # which are matching selectors.
189
+ def find(*selectors, &block)
190
+ _find(Selector.new(*selectors, &block))
191
+ end
192
+
193
+ [
194
+ :_lookup, :_lookup_children, :_lookup_parents,
195
+ :_lookup_siblings, :_lookup_prev_siblings, :_lookup_next_siblings
196
+ ].each do |sym|
197
+ define_method(sym){|*args|
198
+ make_nodes map{|n| n.send(sym, *args)}
199
+ }
200
+ end
201
+
202
+ # not delegate, but redefine: Selector should be constructed only once
203
+ [
204
+ :lookup, :lookup_children, :lookup_parents,
205
+ :lookup_siblings,
206
+ :lookup_next_siblings, :lookup_prev_siblings
207
+ ].map{|sym| [sym, :"_#{sym}"]}.each do |sym, underscored|
208
+
209
+ define_method(sym){|*args, &block|
210
+ send(underscored, Selector.new(*args, &block))
211
+ }
212
+ end
213
+ end
214
+ end
215
+ end
216
+ end