wikidata-fetcher 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6e3918bdb5435b10a4a3c1f9ff68b5a4c8102e87
+   data.tar.gz: 271b48ba804f121f56f5b3ff35e7bdf849d20b07
+ SHA512:
+   metadata.gz: e6926e4a68c676d25541fb0635b7a5337ab3e874e98c29b290e06bc60c998b3e0ecf46536ba3de3b611070b39460102a8dd6ea3ebdeec6b1586fa60b4aaff7af
+   data.tar.gz: c6aced9d0992480c08bcee60be1a1dcd5c47b385282d7554008ddfbc8251174f80b114902433aa953e1fdd0511a921a51b046fd85d5cd2308192973c3689b1b8
@@ -0,0 +1,27 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+
+ *.swp
+
+ *.cache
+ .rubocop-https---raw-githubusercontent-com-everypolitician-everypolitician-data-master--rubocop-base-yml
@@ -0,0 +1,21 @@
+ AllCops:
+   Exclude:
+     - 'Vagrantfile'
+     - 'vendor/**/*'
+   TargetRubyVersion: 2.0
+
+ inherit_from:
+   - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
+   - .rubocop_todo.yml
+
+ Style/AndOr:
+   Exclude:
+     - 'lib/wikidata/fetcher.rb'
+
+ # despite Rubocop's insistence, we can't replace .find_all with .select
+ # everywhere, as they do very different things on a Hash
+ # http://stackoverflow.com/questions/20999192/is-find-all-and-select-the-same-thing/21000136#21000136
+ Style/CollectionMethods:
+   Exclude:
+     - 'lib/wikidata/category.rb'
+
@@ -0,0 +1,24 @@
+ # This configuration was generated by
+ # `rubocop --auto-gen-config`
+ # on 2017-02-11 17:57:53 +0000 using RuboCop version 0.42.0.
+ # The point is for the user to remove these configuration records
+ # one by one as the offenses are removed from the code base.
+ # Note that changes in the inspected code, or installation of new
+ # versions of RuboCop, may require this file to be generated again.
+
+ # Offense count: 6
+ Metrics/AbcSize:
+   Max: 38
+
+ # Offense count: 3
+ Metrics/CyclomaticComplexity:
+   Max: 8
+
+ # Offense count: 5
+ # Configuration parameters: CountComments.
+ Metrics/MethodLength:
+   Max: 25
+
+ # Offense count: 2
+ Metrics/PerceivedComplexity:
+   Max: 8
@@ -0,0 +1,10 @@
+ sudo: false
+ language: ruby
+ cache: bundler
+ before_install:
+   - gem update --system --no-doc
+   - gem install bundler -v 1.14.1
+ rvm:
+   - 2.3.1
+   - 2.2
+   - 2.1
data/CHANGES ADDED
@@ -0,0 +1,67 @@
+ 0.19.0 2016-11-26
+   - Add new EveryPolitician::Wikidata.sparql method for running SPARQL
+     queries
+
+ 0.18.0 2016-10-01
+   - We no longer cache MediaWiki API query results. If you need to do
+     something like that, you should be doing it locally.
+
+ 0.17.0 2016-06-17
+   - Can now specify an optional `table` parameter when fetching data
+     from a Morph scraper
+
+ 0.16.0 2016-06-14
+   - Better handling of errors from Wikisnakker
+
+ 0.15.0 2016-06-05
+   - Delete existing data before re-scraping
+
+ 0.14.0 2016-03-27
+   - Add a `subcategories` method on Category
+   - Move the `last_seen` to a different table
+
+ 0.13.0 2016-03-26
+   - Report on missing Wikidata IDs
+   - Add a `last_seen` column
+
+ 0.12.0 2016-03-25
+   - Add a `batch_size` option
+
+ 0.11.0 2016-03-25
+   - Move the list of want/skip properties to external file
+
+ 0.10.0 2016-02-06
+   - Remove bracketed sections at end of names
+   - Give details of which article has the unknown known
+
+ 0.9.0 2016-02-02
+   - Add WDQ lookup method
+   - Fetch data by IDs as well as names
+
+ 0.8.0 2016-01-12
+   - Fetch data in bulk where possible
+   - Add 'before' and 'after' options to XPath-finder
+
+ 0.7.2 2015-12-31
+   - Add some Scraper helper functions in EveryPolitician::Wikidata
+   - Split out a function for Category member names
+   - Additional properties
+
+ 0.7.0 2015-12-31
+
+ 0.6.4 2015-11-28
+   - Austrian Parliament ID property
+
+ 0.6.0 2015-10-04
+   - return the original pagename where redirected
+
+ 0.5.0 2015-09-23
+   - fetch _all_ names and wikipedia links, not just in the requested
+     languages
+
+ 0.4.1 2015-09-23
+   - Additional properties
+
+ 0.4.0 2015-09-12
+   - resolve redirects automatically
+
data/Gemfile ADDED
@@ -0,0 +1,6 @@
+ source 'https://rubygems.org'
+
+ gem 'wikisnakker', git: 'https://github.com/everypolitician/wikisnakker', branch: 'master'
+
+ # Specify your gem's dependencies in wikidata-fetcher.gemspec
+ gemspec
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Tony Bowden
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,68 @@
+ # Wikidata::Fetcher
+
+ Fetch information useful to EveryPolitician from Wikidata
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem "wikidata-fetcher", git: "https://github.com/everypolitician/wikidata-fetcher.git"
+
+ ## Usage
+
+ ```
+ require 'wikidata/fetcher'
+
+ #------------------------------------------
+ # Step 1: Get a list of Wikipedia pagenames
+ #------------------------------------------
+
+ # from a Wikipedia page, by XPath
+ en_names = EveryPolitician::Wikidata.wikipedia_xpath(
+   url: 'https://en.wikipedia.org/wiki/Template:Peruvian_Congress_2011-2016',
+   xpath: '//table//td[contains(@class,"navbox-list")]//li//a[not(@class="new")]/@title',
+ )
+
+ # or from a Wikipedia Category
+ es_names = WikiData::Category.new('Categoría:Congresistas de Perú 2011-2016', 'es').member_titles
+
+ # or from a Morph scraper
+ names = EveryPolitician::Wikidata.morph_wikinames(source: 'tmtmtmtm/tuvalu-parliament-wikipedia', column: 'wikiname')
+
+ # or from a SPARQL query
+ ids = EveryPolitician::Wikidata.sparql('SELECT ?item WHERE { ?item wdt:P39 wd:Q18229570 . }')
+
+ # or from a WDQ query
+ ids = EveryPolitician::Wikidata.wdq('claim[463:21124329]')
+
+ #-----------------------------------------------------------
+ # Step 2: Scrape the data from Wikidata based on these names
+ #-----------------------------------------------------------
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: { en: names })
+
+ # NB: this can take multiple lists, and can also output the data as it fetches it:
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: {
+   es: es_names,
+   en: en_names,
+ }, output: true)
+
+ #-----------------------------
+ # Step 3: Notify the Rebuilder
+ #-----------------------------
+
+ EveryPolitician::Wikidata.notify_rebuilder
+
+ # (This requires MORPH_REBUILDER_URL to be set in the environment)
+
+ ```
+
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/everypolitician/wikidata-fetcher/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
@@ -0,0 +1,14 @@
+ require 'bundler/gem_tasks'
+
+ require 'rake/testtask'
+ require 'rubocop/rake_task'
+
+ RuboCop::RakeTask.new
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 't'
+   t.libs << 'lib'
+   t.test_files = FileList['t/*.rb']
+ end
+
+ task default: %i(test rubocop)
@@ -0,0 +1,140 @@
+ require 'colorize'
+ require 'digest/sha1'
+ require 'json'
+ require 'mediawiki_api'
+ require 'require_all'
+ require 'wikisnakker'
+
+ require_rel '.'
+
+ class WikiData
+   def self.ids_from_pages(lang, titles)
+     client = MediawikiApi::Client.new "https://#{lang}.wikipedia.org/w/api.php"
+     res = titles.compact.each_slice(50).map do |sliced|
+       page_args = {
+         prop: 'pageprops',
+         ppprop: 'wikibase_item',
+         redirects: 1,
+         titles: sliced.join('|'),
+         token_type: false,
+       }
+       response = client.action :query, page_args
+       redirected_from = Hash[(response.data['redirects'] || []).map { |h| [h['to'], h['from']] }]
+       response.data['pages'].select { |_k, v| v.key? 'pageprops' }.map do |_k, v|
+         [redirected_from[v['title']] || v['title'], v['pageprops']['wikibase_item']]
+       end
+     end
+     results = Hash[res.flatten(1)]
+     missing = titles - results.keys
+     warn "Can't find Wikidata IDs for: #{missing.join(', ')} in #{lang}" if missing.any?
+     results
+   end
+ end
+
+ module EveryPolitician
+   module Wikidata
+     WDQ_URL = 'https://wdq.wmflabs.org/api'.freeze
+
+     def self.wdq(query)
+       result = RestClient.get WDQ_URL, params: { q: query }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:items].map { |id| "Q#{id}" }
+     end
+
+     WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql'.freeze
+
+     def self.sparql(query)
+       result = RestClient.get WIKIDATA_SPARQL_URL, params: { query: query, format: 'json' }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:results][:bindings].map { |res| res[:item][:value].split('/').last }
+     rescue RestClient::Exception => e
+       raise "Wikidata query #{query} failed: #{e.message}"
+     end
+
+     require 'rest-client'
+
+     def self.morph_wikinames(h)
+       morph_api_url = 'https://api.morph.io/%s/data.json' % h[:source]
+       morph_api_key = ENV['MORPH_API_KEY']
+       table = h[:table] || 'data'
+       result = RestClient.get morph_api_url, params: {
+         key: morph_api_key,
+         query: "SELECT DISTINCT(#{h[:column]}) AS wikiname FROM #{table}",
+       }
+       JSON.parse(result, symbolize_names: true).map { |e| e[:wikiname] }.reject { |n| n.to_s.empty? }
+     end
+
+     require 'pry'
+     def self.wikipedia_xpath(h)
+       noko = noko_for(URI.decode(h[:url]))
+
+       if h[:after]
+         point = noko.xpath(h[:after])
+         raise "Can't find #{h[:after]}" if point.empty?
+         point.xpath('.//preceding::*').remove
+       end
+
+       if h[:before]
+         point = noko.xpath(h[:before])
+         raise "Can't find #{h[:before]}" if point.empty?
+         point.xpath('.//following::*').remove
+       end
+
+       names = noko.xpath(h[:xpath]).map(&:text).uniq
+       binding.pry if h[:debug] == true
+       raise "No names found in #{h[:url]}" if names.count.zero?
+       names
+     end
+
+     require 'open-uri'
+     require 'nokogiri'
+
+     def self.noko_for(url)
+       Nokogiri::HTML(open(URI.escape(URI.unescape(url))).read)
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'scraperwiki'
+
+     def self.scrape_wikidata(h)
+       langs = ((h[:lang] || (h[:names] ||= {}).keys) + [:en]).flatten.uniq
+       langpairs = h[:names].map { |lang, names| WikiData.ids_from_pages(lang.to_s, names) }
+       combined = langpairs.reduce({}) { |a, e| a.merge(e.invert) }
+       (h[:ids] ||= []).each { |id| combined[id] ||= nil }
+       # Clean out existing data
+       ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
+
+       Hash[combined.to_a.shuffle].each_slice(h[:batch_size] || 10_000) do |slice|
+         sliced = Hash[slice]
+         found = WikiData::Fetcher.find(sliced.keys)
+         sliced.each do |id, name|
+           unless found[id]
+             warn "No data for #{id}"
+             next
+           end
+
+           begin
+             data = found[id].data(langs)
+           rescue StandardError => e
+             warn "Problem with #{id}: #{e}"
+             next
+           end
+           next unless data
+
+           data[:original_wikiname] = name
+           puts data if h[:output] == true
+           ScraperWiki.save_sqlite([:id], data)
+         end
+       end
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'rest-client'
+
+     def self.notify_rebuilder
+       RestClient.post ENV['MORPH_REBUILDER_URL'], {} if ENV['MORPH_REBUILDER_URL']
+     end
+   end
+ end
@@ -0,0 +1,62 @@
+ require 'mediawiki_api'
+
+ class WikiData
+   class Category < WikiData
+     def initialize(page, lang = 'en')
+       @_page = page
+       @_lang = lang
+     end
+
+     def client
+       @_client ||= MediawikiApi::Client.new "https://#{@_lang}.wikipedia.org/w/api.php"
+     end
+
+     def _categorymembers_search(args = {})
+       cat_args = {
+         cmtitle: @_page,
+         token_type: false,
+         list: 'categorymembers',
+         cmlimit: '500',
+       }.merge(args)
+       client.action :query, cat_args
+     end
+
+     def members
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       while search['continue']
+         search = _categorymembers_search(cmcontinue: search['continue']['cmcontinue'])
+         all << search.data['categorymembers']
+       end
+       all.flatten.select { |m| (m['ns']).zero? }
+     end
+
+     def subcategories
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       all.flatten.select { |m| m['ns'] == 14 }.map { |m| m['title'] }
+     end
+
+     def member_ids
+       members.map { |m| m['pageid'] }.sort
+     end
+
+     def member_titles
+       members.map { |m| m['title'] }.sort
+     end
+
+     def wikidata_ids
+       member_ids.compact.each_slice(50).map do |ids|
+         page_args = {
+           prop: 'pageprops',
+           ppprop: 'wikibase_item',
+           redirects: 1,
+           pageids: ids.join('|'),
+           token_type: false,
+         }
+         response = client.action :query, page_args
+         response.data['pages'].find_all { |p| p.last.key? 'pageprops' }.map { |p| p.last['pageprops']['wikibase_item'] }
+       end.flatten
+     end
+   end
+ end