wikidata-fetcher 0.19.1

@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6e3918bdb5435b10a4a3c1f9ff68b5a4c8102e87
+   data.tar.gz: 271b48ba804f121f56f5b3ff35e7bdf849d20b07
+ SHA512:
+   metadata.gz: e6926e4a68c676d25541fb0635b7a5337ab3e874e98c29b290e06bc60c998b3e0ecf46536ba3de3b611070b39460102a8dd6ea3ebdeec6b1586fa60b4aaff7af
+   data.tar.gz: c6aced9d0992480c08bcee60be1a1dcd5c47b385282d7554008ddfbc8251174f80b114902433aa953e1fdd0511a921a51b046fd85d5cd2308192973c3689b1b8
@@ -0,0 +1,27 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+
+ *.swp
+
+ *.cache
+ .rubocop-https---raw-githubusercontent-com-everypolitician-everypolitician-data-master--rubocop-base-yml
@@ -0,0 +1,21 @@
+ AllCops:
+   Exclude:
+     - 'Vagrantfile'
+     - 'vendor/**/*'
+   TargetRubyVersion: 2.0
+
+ inherit_from:
+   - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
+   - .rubocop_todo.yml
+
+ Style/AndOr:
+   Exclude:
+     - 'lib/wikidata/fetcher.rb'
+
+ # despite Rubocop's insistence, we can't replace .find_all with .select
+ # everywhere, as they do very different things on a Hash
+ # http://stackoverflow.com/questions/20999192/is-find-all-and-select-the-same-thing/21000136#21000136
+ Style/CollectionMethods:
+   Exclude:
+     - 'lib/wikidata/category.rb'
+
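A minimal irb sketch of the difference that comment refers to (the hash contents are invented purely for illustration): on a Hash, `select` returns another Hash, while `find_all` comes from Enumerable and returns an Array of `[key, value]` pairs, which is what `lib/wikidata/category.rb` relies on.

```ruby
h = { 'Q1' => { 'pageprops' => {} }, 'Q2' => {} }

h.select   { |_k, v| v.key?('pageprops') }
# => {"Q1"=>{"pageprops"=>{}}}        (still a Hash)

h.find_all { |_k, v| v.key?('pageprops') }
# => [["Q1", {"pageprops"=>{}}]]      (an Array of [key, value] pairs)
```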
@@ -0,0 +1,24 @@
+ # This configuration was generated by
+ # `rubocop --auto-gen-config`
+ # on 2017-02-11 17:57:53 +0000 using RuboCop version 0.42.0.
+ # The point is for the user to remove these configuration records
+ # one by one as the offenses are removed from the code base.
+ # Note that changes in the inspected code, or installation of new
+ # versions of RuboCop, may require this file to be generated again.
+
+ # Offense count: 6
+ Metrics/AbcSize:
+   Max: 38
+
+ # Offense count: 3
+ Metrics/CyclomaticComplexity:
+   Max: 8
+
+ # Offense count: 5
+ # Configuration parameters: CountComments.
+ Metrics/MethodLength:
+   Max: 25
+
+ # Offense count: 2
+ Metrics/PerceivedComplexity:
+   Max: 8
@@ -0,0 +1,10 @@
+ sudo: false
+ language: ruby
+ cache: bundler
+ before_install:
+   - gem update --system --no-doc
+   - gem install bundler -v 1.14.1
+ rvm:
+   - 2.3.1
+   - 2.2
+   - 2.1
data/CHANGES ADDED
@@ -0,0 +1,67 @@
+ 0.19.0 2016-11-26
+   - Add new EveryPolitician::Wikidata.sparql method for running SPARQL
+     queries
+
+ 0.18.0 2016-10-01
+   - We no longer cache MediaWiki API query results. If you need to do
+     something like that, you should be doing it locally.
+
+ 0.17.0 2016-06-17
+   - Can now specify an optional `table` parameter when fetching data
+     from a Morph scraper
+
+ 0.16.0 2016-06-14
+   - Better handling of errors from Wikisnakker
+
+ 0.15.0 2016-06-05
+   - Delete existing data before re-scraping
+
+ 0.14.0 2016-03-27
+   - Add a `subcategories` method on Category
+   - Move the `last_seen` to a different table
+
+ 0.13.0 2016-03-26
+   - Report on missing Wikidata IDs
+   - Add a `last_seen` column
+
+ 0.12.0 2016-03-25
+   - Add a `batch_size` option
+
+ 0.11.0 2016-03-25
+   - Move the list of want/skip properties to external file
+
+ 0.10.0 2016-02-06
+   - Remove bracketed sections at end of names
+   - Give details of which article has the unknown known
+
+ 0.9.0 2016-02-02
+   - Add WDQ lookup method
+   - Fetch data by IDs as well as names
+
+ 0.8.0 2016-01-12
+   - Fetch data in bulk where possible
+   - Add 'before' and 'after' options to XPath-finder
+
+ 0.7.2 2015-12-31
+   - Add some Scraper helper functions in EveryPolitician::Wikidata
+   - Split out a function for Category member names
+   - Additional properties
+
+ 0.7.0 2015-12-31
+
+ 0.6.4 2015-11-28
+   - Austrian Parliament ID property
+
+ 0.6.0 2015-10-04
+   - return the original pagename where redirected
+
+ 0.5.0 2015-09-23
+   - fetch _all_ names and wikipedia links, not just in the requested
+     languages
+
+ 0.4.1 2015-09-23
+   - Additional properties
+
+ 0.4.0 2015-09-12
+   - resolve redirects automatically
+
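Several of the options noted in this changelog (the `table` parameter for Morph scrapers, the `batch_size` option, and passing Wikidata `ids` directly) appear in the library code below but not in the README. A hedged sketch of how they might be combined; the scraper name, column, table and IDs here are placeholders, not taken from the gem's documentation:

```ruby
require 'wikidata/fetcher'

# `table:` (0.17.0) selects a non-default table in the Morph scraper.
names = EveryPolitician::Wikidata.morph_wikinames(
  source: 'example/some-scraper',   # hypothetical scraper
  column: 'wikiname',
  table:  'members'
)

# `ids:` (0.9.0) and `batch_size:` (0.12.0) are both read by scrape_wikidata.
EveryPolitician::Wikidata.scrape_wikidata(
  names:      { en: names },
  ids:        %w[Q42],              # extra Wikidata item IDs to include
  batch_size: 500,
  output:     true
)
```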
data/Gemfile ADDED
@@ -0,0 +1,6 @@
+ source 'https://rubygems.org'
+
+ gem 'wikisnakker', git: 'https://github.com/everypolitician/wikisnakker', branch: 'master'
+
+ # Specify your gem's dependencies in wikidata-fetcher.gemspec
+ gemspec
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Tony Bowden
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,68 @@
+ # Wikidata::Fetcher
+
+ Fetch information useful to EveryPolitician from Wikidata
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem "wikidata-fetcher", git: "https://github.com/everypolitician/wikidata-fetcher.git"
+
+ ## Usage
+
+ ```
+ require 'wikidata/fetcher'
+
+ #------------------------------------------
+ # Step 1: Get a list of Wikipedia pagenames
+ #------------------------------------------
+
+ # from a Wikipedia page, by XPath
+ en_names = EveryPolitician::Wikidata.wikipedia_xpath(
+   url: 'https://en.wikipedia.org/wiki/Template:Peruvian_Congress_2011-2016',
+   xpath: '//table//td[contains(@class,"navbox-list")]//li//a[not(@class="new")]/@title',
+ )
+
+ # or from a Wikipedia Category
+ es_names = WikiData::Category.new('Categoría:Congresistas de Perú 2011-2016', 'es').member_titles
+
+ # or from a Morph scraper
+ names = EveryPolitician::Wikidata.morph_wikinames(source: 'tmtmtmtm/tuvalu-parliament-wikipedia', column: 'wikiname')
+
+ # or from a SPARQL query
+ ids = EveryPolitician::Wikidata.sparql('SELECT ?item WHERE { ?item wdt:P39 wd:Q18229570 . }')
+
+ # or from a WDQ query
+ ids = EveryPolitician::Wikidata.wdq('claim[463:21124329]')
+
+ #-----------------------------------------------------------
+ # Step 2: Scrape the data from Wikidata based on these names
+ #-----------------------------------------------------------
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: { en: names })
+
+ # NB: this can take multiple lists, and can also output the data as it fetches it:
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: {
+   es: es_names,
+   en: en_names,
+ }, output: true)
+
+ #-----------------------------
+ # Step 3: Notify the Rebuilder
+ #-----------------------------
+
+ EveryPolitician::Wikidata.notify_rebuilder
+
+ # (This requires MORPH_REBUILDER_URL to be set in the environment)
+
+ ```
+
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/everypolitician/wikidata-fetcher/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
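One thing the README's usage block above doesn't show is the `after:`/`before:` options that `wikipedia_xpath` also accepts (added in 0.8.0) to trim the page before the XPath is applied. A hedged sketch; the URL and XPath expressions are chosen purely for illustration:

```ruby
require 'wikidata/fetcher'

# Only keep the part of the page between two anchor nodes.
# These selectors are hypothetical, not from the gem's docs.
names = EveryPolitician::Wikidata.wikipedia_xpath(
  url:    'https://en.wikipedia.org/wiki/Example_page',
  after:  '//h2[span[@id="Members"]]',    # drop everything before this node
  before: '//h2[span[@id="See_also"]]',   # drop everything after this node
  xpath:  '//li//a[not(@class="new")]/@title',
)
```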
@@ -0,0 +1,14 @@
+ require 'bundler/gem_tasks'
+
+ require 'rake/testtask'
+ require 'rubocop/rake_task'
+
+ RuboCop::RakeTask.new
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 't'
+   t.libs << 'lib'
+   t.test_files = FileList['t/*.rb']
+ end
+
+ task default: %i(test rubocop)
@@ -0,0 +1,140 @@
+ require 'colorize'
+ require 'digest/sha1'
+ require 'json'
+ require 'mediawiki_api'
+ require 'require_all'
+ require 'wikisnakker'
+
+ require_rel '.'
+
+ class WikiData
+   def self.ids_from_pages(lang, titles)
+     client = MediawikiApi::Client.new "https://#{lang}.wikipedia.org/w/api.php"
+     res = titles.compact.each_slice(50).map do |sliced|
+       page_args = {
+         prop: 'pageprops',
+         ppprop: 'wikibase_item',
+         redirects: 1,
+         titles: sliced.join('|'),
+         token_type: false,
+       }
+       response = client.action :query, page_args
+       redirected_from = Hash[(response.data['redirects'] || []).map { |h| [h['to'], h['from']] }]
+       response.data['pages'].select { |_k, v| v.key? 'pageprops' }.map do |_k, v|
+         [redirected_from[v['title']] || v['title'], v['pageprops']['wikibase_item']]
+       end
+     end
+     results = Hash[res.flatten(1)]
+     missing = titles - results.keys
+     warn "Can't find Wikidata IDs for: #{missing.join(', ')} in #{lang}" if missing.any?
+     results
+   end
+ end
+
+ module EveryPolitician
+   module Wikidata
+     WDQ_URL = 'https://wdq.wmflabs.org/api'.freeze
+
+     def self.wdq(query)
+       result = RestClient.get WDQ_URL, params: { q: query }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:items].map { |id| "Q#{id}" }
+     end
+
+     WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql'.freeze
+
+     def self.sparql(query)
+       result = RestClient.get WIKIDATA_SPARQL_URL, params: { query: query, format: 'json' }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:results][:bindings].map { |res| res[:item][:value].split('/').last }
+     rescue RestClient::Exception => e
+       raise "Wikidata query #{query} failed: #{e.message}"
+     end
+
+     require 'rest-client'
+
+     def self.morph_wikinames(h)
+       morph_api_url = 'https://api.morph.io/%s/data.json' % h[:source]
+       morph_api_key = ENV['MORPH_API_KEY']
+       table = h[:table] || 'data'
+       result = RestClient.get morph_api_url, params: {
+         key: morph_api_key,
+         query: "SELECT DISTINCT(#{h[:column]}) AS wikiname FROM #{table}",
+       }
+       JSON.parse(result, symbolize_names: true).map { |e| e[:wikiname] }.reject { |n| n.to_s.empty? }
+     end
+
+     require 'pry'
+     def self.wikipedia_xpath(h)
+       noko = noko_for(URI.decode(h[:url]))
+
+       if h[:after]
+         point = noko.xpath(h[:after])
+         raise "Can't find #{h[:after]}" if point.empty?
+         point.xpath('.//preceding::*').remove
+       end
+
+       if h[:before]
+         point = noko.xpath(h[:before])
+         raise "Can't find #{h[:before]}" if point.empty?
+         point.xpath('.//following::*').remove
+       end
+
+       names = noko.xpath(h[:xpath]).map(&:text).uniq
+       binding.pry if h[:debug] == true
+       raise "No names found in #{h[:url]}" if names.count.zero?
+       names
+     end
+
+     require 'open-uri'
+     require 'nokogiri'
+
+     def self.noko_for(url)
+       Nokogiri::HTML(open(URI.escape(URI.unescape(url))).read)
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'scraperwiki'
+
+     def self.scrape_wikidata(h)
+       langs = ((h[:lang] || (h[:names] ||= {}).keys) + [:en]).flatten.uniq
+       langpairs = h[:names].map { |lang, names| WikiData.ids_from_pages(lang.to_s, names) }
+       combined = langpairs.reduce({}) { |a, e| a.merge(e.invert) }
+       (h[:ids] ||= []).each { |id| combined[id] ||= nil }
+       # Clean out existing data
+       ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
+
+       Hash[combined.to_a.shuffle].each_slice(h[:batch_size] || 10_000) do |slice|
+         sliced = Hash[slice]
+         found = WikiData::Fetcher.find(sliced.keys)
+         sliced.each do |id, name|
+           unless found[id]
+             warn "No data for #{id}"
+             next
+           end
+
+           begin
+             data = found[id].data(langs)
+           rescue StandardError => e
+             warn "Problem with #{id}: #{e}"
+             next
+           end
+           next unless data
+
+           data[:original_wikiname] = name
+           puts data if h[:output] == true
+           ScraperWiki.save_sqlite([:id], data)
+         end
+       end
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'rest-client'
+
+     def self.notify_rebuilder
+       RestClient.post ENV['MORPH_REBUILDER_URL'], {} if ENV['MORPH_REBUILDER_URL']
+     end
+   end
+ end
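For completeness, a small sketch of the `WikiData.ids_from_pages` helper defined above, which the README does not cover: it resolves Wikipedia page titles to Wikidata item IDs, keyed by the title that was requested even when that title is a redirect, and warns on stderr about titles it cannot resolve. The example titles are arbitrary:

```ruby
require 'wikidata/fetcher'

# Returns a Hash of { requested_title => wikidata_id }.
ids = WikiData.ids_from_pages('en', ['Douglas Adams', 'Ada Lovelace'])
# => { "Douglas Adams" => "Q42", "Ada Lovelace" => "Q7259" }
```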
@@ -0,0 +1,62 @@
+ require 'mediawiki_api'
+
+ class WikiData
+   class Category < WikiData
+     def initialize(page, lang = 'en')
+       @_page = page
+       @_lang = lang
+     end
+
+     def client
+       @_client ||= MediawikiApi::Client.new "https://#{@_lang}.wikipedia.org/w/api.php"
+     end
+
+     def _categorymembers_search(args = {})
+       cat_args = {
+         cmtitle: @_page,
+         token_type: false,
+         list: 'categorymembers',
+         cmlimit: '500',
+       }.merge(args)
+       client.action :query, cat_args
+     end
+
+     def members
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       while search['continue']
+         search = _categorymembers_search(cmcontinue: search['continue']['cmcontinue'])
+         all << search.data['categorymembers']
+       end
+       all.flatten.select { |m| (m['ns']).zero? }
+     end
+
+     def subcategories
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       all.flatten.select { |m| m['ns'] == 14 }.map { |m| m['title'] }
+     end
+
+     def member_ids
+       members.map { |m| m['pageid'] }.sort
+     end
+
+     def member_titles
+       members.map { |m| m['title'] }.sort
+     end
+
+     def wikidata_ids
+       member_ids.compact.each_slice(50).map do |ids|
+         page_args = {
+           prop: 'pageprops',
+           ppprop: 'wikibase_item',
+           redirects: 1,
+           pageids: ids.join('|'),
+           token_type: false,
+         }
+         response = client.action :query, page_args
+         response.data['pages'].find_all { |p| p.last.key? 'pageprops' }.map { |p| p.last['pageprops']['wikibase_item'] }
+       end.flatten
+     end
+   end
+ end
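Beyond the `member_titles` call shown in the README, the Category class above also exposes `member_ids`, `subcategories` and `wikidata_ids`. A brief usage sketch; the category name is only an example:

```ruby
require 'wikidata/fetcher'

category = WikiData::Category.new('Category:Prime Ministers of Tuvalu', 'en')

category.member_titles   # sorted article titles in the category (namespace 0)
category.member_ids      # sorted page IDs for those articles
category.subcategories   # titles of sub-categories (namespace 14)
category.wikidata_ids    # wikibase item IDs ("Q…") for the member pages
```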