infoboxer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,162 @@
1
+ # encoding: utf-8
2
+ require 'rest-client'
3
+ require 'json'
4
+ require 'addressable/uri'
5
+
6
+ require_relative 'media_wiki/traits'
7
+ require_relative 'media_wiki/page'
8
+
9
+ module Infoboxer
10
+ # MediaWiki client class.
11
+ #
12
+ # Usage:
13
+ #
14
+ # ```ruby
15
+ # client = Infoboxer::MediaWiki.new('http://en.wikipedia.org/w/api.php', user_agent: 'My Own Project')
16
+ # page = client.get('Argentina')
17
+ # ```
18
+ #
19
+ # Consider using shortcuts like {Infoboxer.wiki}, {Infoboxer.wikipedia},
20
+ # {Infoboxer.wp} and so on instead of direct instantiation of this class
21
+ # (although you can if you want to!)
22
+ #
23
+ class MediaWiki
24
+ # Default Infoboxer User-Agent header.
25
+ #
26
+ # You can set yours as an option to {Infoboxer.wiki} and its shortcuts,
27
+ # or to {#initialize}
28
+ UA = "Infoboxer/#{Infoboxer::VERSION} (https://github.com/molybdenum-99/infoboxer; zverok.offline@gmail.com)"
29
+
30
+ class << self
31
+ # User agent getter/setter.
32
+ #
33
+ # Default value is {UA}.
34
+ #
35
+ # You can also use per-instance option, see {#initialize}
36
+ attr_accessor :user_agent
37
+ end
38
+
39
+ attr_reader :api_base_url
40
+
41
# Creates a MediaWiki client bound to one API endpoint.
# {Infoboxer.wiki} is a shortcut for this constructor; there are also
# shortcuts for some well-known wikis, like {Infoboxer.wikipedia}.
#
# @param api_base_url URL of the `api.php` file of your MediaWiki
#   installation. Typically, it's `<domain>/w/api.php`, but it can vary
#   between wikis.
# @param options Only one option is currently supported:
#   * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
def initialize(api_base_url, options = {})
  # Parsed form is kept for later inspection (e.g. reading the host).
  @api_base_url = Addressable::URI.parse(api_base_url)

  request_headers = headers(options)
  @resource = RestClient::Resource.new(api_base_url, headers: request_headers)
end
54
+
55
# Fetches "raw" page data from the wiki: neither parsed nor wrapped
# in classes.
#
# All titles are joined with `|` and sent in a single request.
#
# @return [Array<Hash>]
def raw(*titles)
  query = DEFAULT_PARAMS.merge(titles: titles.join('|'))
  response = @resource.get(params: query)

  postprocess(response)
end
64
+
65
# Fetches and parses wiki pages for every title provided.
# All pages are received with a single query to MediaWiki API.
#
# **NB**: currently, if you are requesting more than 50 titles at
# once (MediaWiki limitation for single request), Infoboxer will
# **not** try to get other pages with subsequent queries. This will
# be fixed in future.
#
# @return [Tree::Nodes<Page>] array of parsed pages. Notes:
#   * if you call `get` with only one title, one page will be
#     returned instead of an array (`nil` when nothing was received);
#   * pages without content (e.g. absent from the wiki) are not
#     returned, therefore the resulting array can be shorter than the
#     titles array; you can always check `pages.map(&:title)` to see
#     what you've really received; this approach allows you to write
#     absent-minded code like this:
#
#     ```ruby
#     Infoboxer.wp.get('Argentina', 'Chile', 'Something non-existing').
#       infobox.fetch('some value')
#     ```
#     and obtain meaningful results instead of NoMethodError or some
#     NotFound.
#
def get(*titles)
  received = raw(*titles).reject { |data| data[:content].nil? }

  pages = received.map do |data|
    traits = Traits.get(@api_base_url.host, extract_traits(data))
    tree = Parser.paragraphs(data[:content], traits)

    Page.new(self, tree, data.merge(traits: traits))
  end

  # Decision is made by the number of titles *requested*, not received.
  return pages.first if titles.count == 1

  Tree::Nodes[*pages]
end
100
+
101
+ private
102
+
103
# @private
# Page properties requested from the API:
#
# * `revisions`  -- to extract content of the page;
# * `info`       -- to extract page canonical url;
# * `categories` -- to extract default category prefix;
# * `images`     -- to extract default media prefix.
PROP = %w[revisions info categories images].join('|')

# @private
# Query parameters sent with every request; {#raw} adds `titles` to them.
DEFAULT_PARAMS = {
  action: :query, format: :json, redirects: true,
  prop: PROP, rvprop: :content, inprop: :url
}
121
+
122
# Builds HTTP headers for API requests.
#
# User-Agent is the first value set among: `:user_agent` option, `:ua`
# option, class-wide `user_agent`, and finally the {UA} constant.
def headers(options)
  agent = options[:user_agent] || options[:ua]
  agent ||= self.class.user_agent || UA

  {'User-Agent' => agent}
end
125
+
126
# Picks from raw page data only the entries describing wiki traits
# (`:file_prefix` and `:category_prefix`); everything else is dropped.
def extract_traits(raw)
  raw.select { |key, _| key == :file_prefix || key == :category_prefix }
end
129
+
130
# Guesses namespace prefixes used by the wiki from the pages received.
#
# @param pages [Array<Hash>] page hashes from the API response; their
#   optional `'images'` and `'categories'` entries are lists of hashes
#   with a `'title'` key.
# @return [Hash] `:file_prefix` and `:category_prefix`, each a list of
#   unique prefixes (empty when nothing could be extracted).
def guess_traits(pages)
  {
    file_prefix: title_prefixes(pages, 'images'),
    category_prefix: title_prefixes(pages, 'categories')
  }
end

# Unique prefixes (the part before the first colon) of all titles listed
# under `key` in `pages`. Titles without a prefix are skipped.
def title_prefixes(pages, key)
  entries = pages.map { |page| page[key] }.compact.flatten

  # \A anchors at the very start of the title, so that only a real leading
  # prefix is taken; #to_s guards against entries without a title.
  entries.map { |entry| entry['title'].to_s[/\A([^:]+):/, 1] }.compact.uniq
end
138
+
139
# Converts the API response body into a list of page hashes.
#
# @param response body of the API response (JSON text is expected).
# @return [Array<Hash>] one hash per page of the response:
#   * for pages with a negative id: `:title`, `content: nil` and
#     `not_found: true`;
#   * for other pages: `:title`, `:content`, `:url` plus the traits
#     guessed from the whole response. `:content` is `nil` when the
#     response carries no revisions for the page.
#   The list is empty when the response has no `query.pages` part.
# @raise [RuntimeError] if the response is not JSON, or if it is an API
#   error report.
def postprocess(response)
  parsed = JSON.parse(response)

  # An error report has no 'query' part; fail with a readable message
  # instead of NoMethodError on nil.
  api_error = parsed['error']
  if api_error
    details = api_error.is_a?(Hash) ? (api_error['info'] || api_error['code']) : api_error
    raise RuntimeError, "MediaWiki API error: #{details}"
  end

  pages = (parsed['query'] || {})['pages'] || {}
  traits = guess_traits(pages.values)

  pages.map { |id, data|
    if id.to_i < 0
      {title: data['title'], content: nil, not_found: true}
    else
      # NOTE(review): presumably revisions can be missing when the API
      # truncates a large response -- confirm against MediaWiki docs.
      revision = (data['revisions'] || []).first

      {
        title: data['title'],
        content: revision && revision['*'],
        url: data['fullurl']
      }.merge(traits)
    end
  }
rescue JSON::ParserError
  raise RuntimeError, "Not a JSON response, seems there's not a MediaWiki API: #{@api_base_url}"
end
161
+ end
162
+ end
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class MediaWiki
4
# {Tree::Document Document} descendant representing a page received
# from the {MediaWiki} client.
#
# Besides the document tree structure, it knows the document's title as
# represented by MediaWiki, and its human (non-API) URL.
class Page < Tree::Document
  # Names of params picked by {#show_params}.
  # (Declared before `private`; constants are not affected by it anyway.)
  PARAMS_TO_INSPECT = [:url, :title, :domain]

  # Remembers the client and hands `children` and `raw` over to the
  # {Tree::Document} constructor.
  def initialize(client, children, raw)
    @client = client
    super(children, raw)
  end

  # Instance of {MediaWiki} which this page was received from
  # @return {MediaWiki}
  attr_reader :client

  # @!attribute [r] title
  #   Page title.
  #   @return [String]

  # @!attribute [r] url
  #   Page friendly URL.
  #   @return [String]

  def_readers :title, :url, :traits

  private

  # Narrows params down to those listed in PARAMS_TO_INSPECT before
  # passing them to the parent implementation.
  def show_params
    visible = params.select { |name, _| PARAMS_TO_INSPECT.include?(name) }
    super(visible)
  end
end
37
+ end
38
+ end
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ class MediaWiki
4
# Wiki-specific settings: file/category namespace prefixes and template
# definitions. Subclasses can be bound to wiki domains (see {Traits.for}
# and {Traits.domain}) and are looked up with {Traits.get}.
class Traits
  class << self
    # Template definitions of this traits class.
    #
    # Without a block, returns the (lazily created) templates set; with
    # a block, passes the block to the set's `define`.
    def templates(&definition)
      @templates ||= Templates::Set.new

      return @templates unless definition

      @templates.define(&definition)
    end

    # Binds this class to domain `d`.
    #
    # NB: explicitly store all domains in base Traits class
    #
    # @raise [ArgumentError] if the domain is already bound.
    def domain(d)
      if Traits.domains.key?(d)
        raise ArgumentError, "Domain binding redefinition: #{Traits.domains[d]}"
      end

      Traits.domains[d] = self
    end

    # Instantiates the class bound to `domain`, or base Traits when
    # nothing is bound to it.
    def get(domain, options = {})
      (Traits.domains[domain] || Traits).new(options)
    end

    # Domain => traits class bindings.
    def domains
      @domains ||= {}
    end

    # Creates an anonymous subclass of the receiver, with `block` as its
    # class body, and binds it to `domain`.
    def for(domain, &block)
      Class.new(self, &block).domain(domain)
    end

    alias_method :default, :new
  end

  # Prefixes which are always present, whatever options are passed.
  DEFAULTS = {
    file_prefix: 'File',
    category_prefix: 'Category'
  }

  # @param options [Hash] `:file_prefix` and `:category_prefix` (a string
  #   or an array of strings each) are added to the defaults; the rest is
  #   stored as is.
  def initialize(options = {})
    # Work on a copy: the prefix keys are removed from the stored options,
    # and the caller's hash must stay intact.
    @options = options.dup
    @file_prefix = [DEFAULTS[:file_prefix], @options.delete(:file_prefix)].
      flatten.compact.uniq
    @category_prefix = [DEFAULTS[:category_prefix], @options.delete(:category_prefix)].
      flatten.compact.uniq
  end

  attr_reader :file_prefix, :category_prefix

  #attr_accessor :re

  # Template definitions of this object's class, see {Traits.templates}.
  def templates
    self.class.templates
  end
end
59
+ end
60
+ end
@@ -0,0 +1,84 @@
1
+ # encoding: utf-8
2
+ module Infoboxer
3
+ # Navigation is one of the things Infoboxer is proud about. It tries
4
+ # to be logical, unobtrusive and compact.
5
+ #
6
+ # There are several levels of navigation:
7
+ # * simple tree navigation;
8
+ # * navigational shortcuts;
9
+ # * logical structure navigation (sections).
10
+ #
11
+ # ## Simple tree navigation
12
+ #
13
+ # It's somewhat similar to XPath/CSS selectors you'll use to navigate
14
+ # through HTML DOM. It is represented (and documented) in {Lookup::Node}
15
+ # module. To show you the taste of it:
16
+ #
17
+ # ```ruby
18
+ # document.
19
+ # lookup(:Wikilink, text: /Chile/).
20
+ # lookup_parents(:Table){|t| t.params[:class] == 'wikitable'}.
21
+ # lookup_children(size: 3)
22
+ # ```
23
+ #
24
+ # ## Navigational shortcuts
25
+ #
26
+ # There is nothing too complicated, just pretty shortcuts over `lookup_*`
27
+ # methods, so, you can write just
28
+ #
29
+ # ```ruby
30
+ # document.paragraphs.last.wikilinks('Category')
31
+ # ```
32
+ # ...instead of
33
+ # ```ruby
34
+ # document.lookup(:Paragraph).last.lookup(:Wikilink, namespace: 'Category')
35
+ # ```
36
+ # ...and so on.
37
+ #
38
+ # Look into {Shortcuts::Node} documentation for list of shortcuts.
39
+ #
40
+ # ## Logical structure navigation
41
+ #
42
+ # MediaWiki page structure is flat, like HTML's (there's just sequence
43
+ # of headings and paragraphs). Though, for most tasks of information
44
+ # extraction it is useful to think of page as a structure of nested
45
+ # sections. {Sections} module provides such ability. It treats document
46
+ # as an {Sections::Container#intro intro} and set of subsequent
47
+ # {Sections::Section section}s of same level, which, in turn, have inside
48
+ # their own intro and sections. Also, each node has
49
+ # {Sections::Node#in_sections #in_sections} method, returning all sections
50
+ # in which it is nested.
51
+ #
52
+ # The code with sections can feel like this:
53
+ #
54
+ # ```ruby
55
+ # page.sections('Culture' => 'Music').tables
56
+ # # or like this
57
+ # page.wikilinks.select{|link| link.in_sections.first.heading.text.include?('Culture')}
58
+ # ```
59
+ #
60
+ # See {Sections::Container} for downwards section navigation, and
61
+ # {Sections::Node} for upwards.
62
+ #
63
+ module Navigation
64
+ %w[lookup shortcuts sections].each do |nav|
65
+ require_relative "navigation/#{nav}"
66
+ end
67
+
68
+ class Tree::Node
69
+ include Navigation::Lookup::Node
70
+ include Navigation::Shortcuts::Node
71
+ include Navigation::Sections::Node
72
+ end
73
+
74
+ class Tree::Nodes
75
+ include Navigation::Lookup::Nodes
76
+ include Navigation::Shortcuts::Nodes
77
+ include Navigation::Sections::Nodes
78
+ end
79
+
80
+ class Tree::Document
81
+ include Navigation::Sections::Container
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,216 @@
1
+ # encoding: utf-8
2
+ require_relative 'selector'
3
+
4
+ module Infoboxer
5
+ module Navigation
6
+ # See {Lookup::Node Lookup::Node} for everything!
7
+ module Lookup
8
+ # `Lookup::Node` module provides methods for navigating through
9
+ # page tree in XPath-like manner.
10
+ #
11
+ # What you need to know about it:
12
+ #
13
+ # ## Selectors
14
+ #
15
+ # Each `lookup_*` method (and others similar) receive
16
+ # _list of selectors_. Examples of acceptable selectors:
17
+ #
18
+ # ```ruby
19
+ # # 1. Node class:
20
+ # document.lookup(Bold) # all Bolds
21
+ #
22
+ # # 2. Class symbol
23
+ # document.lookup(:Bold)
24
+ # # same as above, useful if you don't want to include Infoboxer::Tree
25
+ # # in all of your code or write things like lookup(Infoboxer::Tree::Bold)
26
+ #
27
+ # # 3. Getter/pattern:
28
+ # document.lookup(text: /something/)
29
+ # # finds all nodes where result of getter matches pattern
30
+ #
31
+ # # Checks against patterns are performed with `===`, so you can
32
+ # # use regexps to find by text, or ranges to find by number, like
33
+ # document.lookup(:Heading, level: (3..4))
34
+ #
35
+ # # Nodes where method is not defined are ignored, so you can
36
+ # # rewrite above example as just
37
+ # document.lookup(level: 3..4)
38
+ # # ...and receive meaningful result without any NoMethodError
39
+ #
40
+ # # 4. Check symbol
41
+ # document.lookup(:bold?)
42
+ # # finds all nodes for which `:bold?` is defined and returns
43
+ # # truthy value;
44
+ #
45
+ # # 5. Code block
46
+ # document.lookup{|node| node.params.has_key?(:class)}
47
+ # ```
48
+ #
49
+ # You also can use any of those method without **any** selector,
50
+ # thus receiving ALL parents, ALL children, ALL siblings and so on.
51
+ #
52
+ # ## Chainable navigation
53
+ #
54
+ # Each `lookup_*` method returns an instance of {Tree::Nodes} class,
55
+ # which behaves like an Array, but also defines similar set of
56
+ # `lookup_*` methods, so, you can brainlessly do the things like
57
+ #
58
+ # ```ruby
59
+ # document.
60
+ # lookup(:Paragraph){|p| p.text.length > 100}.
61
+ # lookup(:Wikilink, text: /^List of/).
62
+ # select(&:bold?)
63
+ # ```
64
+ #
65
+ # ## Underscored methods
66
+ #
67
+ # For all methods of this module you can notice "underscored" version
68
+ # (`lookup_children` vs `_lookup_children` and so on). Basically,
69
+ # underscored versions accept instance of {Lookup::Selector}, which
70
+ # is already preprocessed version of all selectors. It is kinda
71
+ # internal thing, though can be useful if you store selectors in
72
+ # variables -- it is easier to have and use just one instance of
73
+ # Selector, than list of arguments and blocks.
74
+ #
75
+ module Node
76
+ # @!method matches?(*selectors, &block)
77
+ # Checks if current node matches selectors.
78
+
79
+ # @!method lookup(*selectors, &block)
80
+ # Selects matching nodes from entire subtree inside current node.
81
+
82
+ # @!method lookup_children(*selectors, &block)
83
+ # Selects nodes only from this node's direct children.
84
+
85
+ # @!method lookup_parents(*selectors, &block)
86
+ # Selects matching nodes of this node's parents chain, up to
87
+ # entire {Tree::Document Document}.
88
+
89
+ # @!method lookup_siblings(*selectors, &block)
90
+ # Selects matching nodes from current node's siblings.
91
+
92
+ # @!method lookup_next_siblings(*selectors, &block)
93
+ # Selects matching nodes from current node's siblings, which
94
+ # are below current node in parents children list.
95
+
96
+ # @!method lookup_prev_siblings(*selectors, &block)
97
+ # Selects matching nodes from current node's siblings, which
98
+ # are above current node in parents children list.
99
+
100
+ # Underscored version of {#matches?}
101
+ def _matches?(selector)
102
+ selector.matches?(self)
103
+ end
104
+
105
+ # Underscored version of {#lookup}
106
+ def _lookup(selector)
107
+ Tree::Nodes[_matches?(selector) ? self : nil, *children._lookup(selector)].
108
+ flatten.compact
109
+ end
110
+
111
+ # Underscored version of {#lookup_children}
112
+ def _lookup_children(selector)
113
+ @children._find(selector)
114
+ end
115
+
116
+ # Underscored version of {#lookup_parents}
117
+ def _lookup_parents(selector)
118
+ case
119
+ when !parent
120
+ Tree::Nodes[]
121
+ when parent._matches?(selector)
122
+ Tree::Nodes[parent, *parent._lookup_parents(selector)]
123
+ else
124
+ parent._lookup_parents(selector)
125
+ end
126
+ end
127
+
128
+ # Underscored version of {#lookup_siblings}
129
+ def _lookup_siblings(selector)
130
+ siblings._find(selector)
131
+ end
132
+
133
+ # Underscored version of {#lookup_prev_siblings}
134
+ def _lookup_prev_siblings(selector)
135
+ prev_siblings._find(selector)
136
+ end
137
+
138
+ # Underscored version of {#lookup_next_siblings}
139
+ def _lookup_next_siblings(selector)
140
+ next_siblings._find(selector)
141
+ end
142
+
143
+ [:matches?,
144
+ :lookup, :lookup_children, :lookup_parents,
145
+ :lookup_siblings,
146
+ :lookup_next_siblings, :lookup_prev_siblings
147
+ ].map{|sym| [sym, :"_#{sym}"]}.each do |sym, underscored|
148
+
149
+ define_method(sym){|*args, &block|
150
+ send(underscored, Selector.new(*args, &block))
151
+ }
152
+ end
153
+
154
+ # Checks if node has any parent matching selectors.
155
+ def has_parent?(*selectors, &block)
156
+ !lookup_parents(*selectors, &block).empty?
157
+ end
158
+ end
159
+
160
+ # This module provides implementations for all `lookup_*` methods
161
+ # of {Lookup::Node} for be used on nodes list. Note, that all
162
+ # those methods return _flat_ list of results (so, if you have
163
+ # found several nodes, and then look for their siblings, you should
164
+ # not expect array of arrays -- just one array of nodes).
165
+ #
166
+ # See {Lookup::Node} for detailed lookups and selectors explanation.
167
+ module Nodes
168
+ # @!method lookup(*selectors, &block)
169
+ # @!method lookup_children(*selectors, &block)
170
+ # @!method lookup_parents(*selectors, &block)
171
+ # @!method lookup_siblings(*selectors, &block)
172
+ # @!method lookup_next_siblings(*selectors, &block)
173
+ # @!method lookup_prev_siblings(*selectors, &block)
174
+
175
+ # @!method _lookup(selector)
176
+ # @!method _lookup_children(selector)
177
+ # @!method _lookup_parents(selector)
178
+ # @!method _lookup_siblings(selector)
179
+ # @!method _lookup_next_siblings(selector)
180
+ # @!method _lookup_prev_siblings(selector)
181
+
182
+ # Underscored version of {#find}.
183
+ def _find(selector)
184
+ select{|n| n._matches?(selector)}
185
+ end
186
+
187
+ # Selects nodes of current list (and only it, no children checks),
188
+ # which are matching selectors.
189
+ def find(*selectors, &block)
190
+ _find(Selector.new(*selectors, &block))
191
+ end
192
+
193
+ [
194
+ :_lookup, :_lookup_children, :_lookup_parents,
195
+ :_lookup_siblings, :_lookup_prev_siblings, :_lookup_next_siblings
196
+ ].each do |sym|
197
+ define_method(sym){|*args|
198
+ make_nodes map{|n| n.send(sym, *args)}
199
+ }
200
+ end
201
+
202
+ # not delegate, but redefine: Selector should be constructed only once
203
+ [
204
+ :lookup, :lookup_children, :lookup_parents,
205
+ :lookup_siblings,
206
+ :lookup_next_siblings, :lookup_prev_siblings
207
+ ].map{|sym| [sym, :"_#{sym}"]}.each do |sym, underscored|
208
+
209
+ define_method(sym){|*args, &block|
210
+ send(underscored, Selector.new(*args, &block))
211
+ }
212
+ end
213
+ end
214
+ end
215
+ end
216
+ end