wikidata-fetcher 0.19.1

@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6e3918bdb5435b10a4a3c1f9ff68b5a4c8102e87
+   data.tar.gz: 271b48ba804f121f56f5b3ff35e7bdf849d20b07
+ SHA512:
+   metadata.gz: e6926e4a68c676d25541fb0635b7a5337ab3e874e98c29b290e06bc60c998b3e0ecf46536ba3de3b611070b39460102a8dd6ea3ebdeec6b1586fa60b4aaff7af
+   data.tar.gz: c6aced9d0992480c08bcee60be1a1dcd5c47b385282d7554008ddfbc8251174f80b114902433aa953e1fdd0511a921a51b046fd85d5cd2308192973c3689b1b8
@@ -0,0 +1,27 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+
+ *.swp
+
+ *.cache
+ .rubocop-https---raw-githubusercontent-com-everypolitician-everypolitician-data-master--rubocop-base-yml
@@ -0,0 +1,21 @@
+ AllCops:
+   Exclude:
+     - 'Vagrantfile'
+     - 'vendor/**/*'
+   TargetRubyVersion: 2.0
+
+ inherit_from:
+   - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
+   - .rubocop_todo.yml
+
+ Style/AndOr:
+   Exclude:
+     - 'lib/wikidata/fetcher.rb'
+
+ # despite Rubocop's insistence, we can't replace .find_all with .select
+ # everywhere, as they do very different things on a Hash
+ # http://stackoverflow.com/questions/20999192/is-find-all-and-select-the-same-thing/21000136#21000136
+ Style/CollectionMethods:
+   Exclude:
+     - 'lib/wikidata/category.rb'
+
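A minimal irb sketch of the difference that comment refers to (the hash contents are invented purely for illustration): on a Hash, `select` returns another Hash, while `find_all` comes from Enumerable and returns an Array of `[key, value]` pairs, which is what `lib/wikidata/category.rb` relies on.

```ruby
h = { 'Q1' => { 'pageprops' => {} }, 'Q2' => {} }

h.select   { |_k, v| v.key?('pageprops') }
# => {"Q1"=>{"pageprops"=>{}}}        (still a Hash)

h.find_all { |_k, v| v.key?('pageprops') }
# => [["Q1", {"pageprops"=>{}}]]      (an Array of [key, value] pairs)
```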
@@ -0,0 +1,24 @@
+ # This configuration was generated by
+ # `rubocop --auto-gen-config`
+ # on 2017-02-11 17:57:53 +0000 using RuboCop version 0.42.0.
+ # The point is for the user to remove these configuration records
+ # one by one as the offenses are removed from the code base.
+ # Note that changes in the inspected code, or installation of new
+ # versions of RuboCop, may require this file to be generated again.
+
+ # Offense count: 6
+ Metrics/AbcSize:
+   Max: 38
+
+ # Offense count: 3
+ Metrics/CyclomaticComplexity:
+   Max: 8
+
+ # Offense count: 5
+ # Configuration parameters: CountComments.
+ Metrics/MethodLength:
+   Max: 25
+
+ # Offense count: 2
+ Metrics/PerceivedComplexity:
+   Max: 8
@@ -0,0 +1,10 @@
+ sudo: false
+ language: ruby
+ cache: bundler
+ before_install:
+   - gem update --system --no-doc
+   - gem install bundler -v 1.14.1
+ rvm:
+   - 2.3.1
+   - 2.2
+   - 2.1
data/CHANGES ADDED
@@ -0,0 +1,67 @@
+ 0.19.0 2016-11-26
+   - Add new EveryPolitician::Wikidata.sparql method for running SPARQL
+     queries
+
+ 0.18.0 2016-10-01
+   - We no longer cache MediaWiki API query results. If you need to do
+     something like that, you should be doing it locally.
+
+ 0.17.0 2016-06-17
+   - Can now specify an optional `table` parameter when fetching data
+     from a Morph scraper
+
+ 0.16.0 2016-06-14
+   - Better handling of errors from Wikisnakker
+
+ 0.15.0 2016-06-05
+   - Delete existing data before re-scraping
+
+ 0.14.0 2016-03-27
+   - Add a `subcategories` method on Category
+   - Move the `last_seen` to a different table
+
+ 0.13.0 2016-03-26
+   - Report on missing Wikidata IDs
+   - Add a `last_seen` column
+
+ 0.12.0 2016-03-25
+   - Add a `batch_size` option
+
+ 0.11.0 2016-03-25
+   - Move the list of want/skip properties to external file
+
+ 0.10.0 2016-02-06
+   - Remove bracketed sections at end of names
+   - Give details of which article has the unknown known
+
+ 0.9.0 2016-02-02
+   - Add WDQ lookup method
+   - Fetch data by IDs as well as names
+
+ 0.8.0 2016-01-12
+   - Fetch data in bulk where possible
+   - Add 'before' and 'after' options to XPath-finder
+
+ 0.7.2 2015-12-31
+   - Add some Scraper helper functions in EveryPolitician::Wikidata
+   - Split out a function for Category member names
+   - Additional properties
+
+ 0.7.0 2015-12-31
+
+ 0.6.4 2015-11-28
+   - Austrian Parliament ID property
+
+ 0.6.0 2015-10-04
+   - return the original pagename where redirected
+
+ 0.5.0 2015-09-23
+   - fetch _all_ names and wikipedia links, not just in the requested
+     languages
+
+ 0.4.1 2015-09-23
+   - Additional properties
+
+ 0.4.0 2015-09-12
+   - resolve redirects automatically
+
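Several of the options noted in this changelog (the `table` parameter for Morph scrapers, the `batch_size` option, and passing Wikidata `ids` directly) appear in the library code below but not in the README. A hedged sketch of how they might be combined; the scraper name, column, table and IDs here are placeholders, not taken from the gem's documentation:

```ruby
require 'wikidata/fetcher'

# `table:` (0.17.0) selects a non-default table in the Morph scraper.
names = EveryPolitician::Wikidata.morph_wikinames(
  source: 'example/some-scraper',   # hypothetical scraper
  column: 'wikiname',
  table:  'members'
)

# `ids:` (0.9.0) and `batch_size:` (0.12.0) are both read by scrape_wikidata.
EveryPolitician::Wikidata.scrape_wikidata(
  names:      { en: names },
  ids:        %w[Q42],              # extra Wikidata item IDs to include
  batch_size: 500,
  output:     true
)
```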
data/Gemfile ADDED
@@ -0,0 +1,6 @@
+ source 'https://rubygems.org'
+
+ gem 'wikisnakker', git: 'https://github.com/everypolitician/wikisnakker', branch: 'master'
+
+ # Specify your gem's dependencies in wikidata-fetcher.gemspec
+ gemspec
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Tony Bowden
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,68 @@
+ # Wikidata::Fetcher
+
+ Fetch information useful to EveryPolitician from Wikidata
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem "wikidata-fetcher", git: "https://github.com/everypolitician/wikidata-fetcher.git"
+
+ ## Usage
+
+ ```
+ require 'wikidata/fetcher'
+
+ #------------------------------------------
+ # Step 1: Get a list of Wikipedia pagenames
+ #------------------------------------------
+
+ # from a Wikipedia page, by XPath
+ en_names = EveryPolitician::Wikidata.wikipedia_xpath(
+   url: 'https://en.wikipedia.org/wiki/Template:Peruvian_Congress_2011-2016',
+   xpath: '//table//td[contains(@class,"navbox-list")]//li//a[not(@class="new")]/@title',
+ )
+
+ # or from a Wikipedia Category
+ es_names = WikiData::Category.new('Categoría:Congresistas de Perú 2011-2016', 'es').member_titles
+
+ # or from a Morph scraper
+ names = EveryPolitician::Wikidata.morph_wikinames(source: 'tmtmtmtm/tuvalu-parliament-wikipedia', column: 'wikiname')
+
+ # or from a SPARQL query
+ ids = EveryPolitician::Wikidata.sparql('SELECT ?item WHERE { ?item wdt:P39 wd:Q18229570 . }')
+
+ # or from a WDQ query
+ ids = EveryPolitician::Wikidata.wdq('claim[463:21124329]')
+
+ #-----------------------------------------------------------
+ # Step 2: Scrape the data from Wikidata based on these names
+ #-----------------------------------------------------------
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: { en: names })
+
+ # NB: this can take multiple lists, and can also output the data as it fetches it:
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: {
+   es: es_names,
+   en: en_names,
+ }, output: true)
+
+ #-----------------------------
+ # Step 3: Notify the Rebuilder
+ #-----------------------------
+
+ EveryPolitician::Wikidata.notify_rebuilder
+
+ # (This requires MORPH_REBUILDER_URL to be set in the environment)
+
+ ```
+
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/everypolitician/wikidata-fetcher/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
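One thing the README's usage block above doesn't show is the `after:`/`before:` options that `wikipedia_xpath` also accepts (added in 0.8.0) to trim the page before the XPath is applied. A hedged sketch; the URL and XPath expressions are chosen purely for illustration:

```ruby
require 'wikidata/fetcher'

# Only keep the part of the page between two anchor nodes.
# These selectors are hypothetical, not from the gem's docs.
names = EveryPolitician::Wikidata.wikipedia_xpath(
  url:    'https://en.wikipedia.org/wiki/Example_page',
  after:  '//h2[span[@id="Members"]]',    # drop everything before this node
  before: '//h2[span[@id="See_also"]]',   # drop everything after this node
  xpath:  '//li//a[not(@class="new")]/@title',
)
```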
@@ -0,0 +1,14 @@
+ require 'bundler/gem_tasks'
+
+ require 'rake/testtask'
+ require 'rubocop/rake_task'
+
+ RuboCop::RakeTask.new
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 't'
+   t.libs << 'lib'
+   t.test_files = FileList['t/*.rb']
+ end
+
+ task default: %i(test rubocop)
@@ -0,0 +1,140 @@
+ require 'colorize'
+ require 'digest/sha1'
+ require 'json'
+ require 'mediawiki_api'
+ require 'require_all'
+ require 'wikisnakker'
+
+ require_rel '.'
+
+ class WikiData
+   def self.ids_from_pages(lang, titles)
+     client = MediawikiApi::Client.new "https://#{lang}.wikipedia.org/w/api.php"
+     res = titles.compact.each_slice(50).map do |sliced|
+       page_args = {
+         prop: 'pageprops',
+         ppprop: 'wikibase_item',
+         redirects: 1,
+         titles: sliced.join('|'),
+         token_type: false,
+       }
+       response = client.action :query, page_args
+       redirected_from = Hash[(response.data['redirects'] || []).map { |h| [h['to'], h['from']] }]
+       response.data['pages'].select { |_k, v| v.key? 'pageprops' }.map do |_k, v|
+         [redirected_from[v['title']] || v['title'], v['pageprops']['wikibase_item']]
+       end
+     end
+     results = Hash[res.flatten(1)]
+     missing = titles - results.keys
+     warn "Can't find Wikidata IDs for: #{missing.join(', ')} in #{lang}" if missing.any?
+     results
+   end
+ end
+
+ module EveryPolitician
+   module Wikidata
+     WDQ_URL = 'https://wdq.wmflabs.org/api'.freeze
+
+     def self.wdq(query)
+       result = RestClient.get WDQ_URL, params: { q: query }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:items].map { |id| "Q#{id}" }
+     end
+
+     WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql'.freeze
+
+     def self.sparql(query)
+       result = RestClient.get WIKIDATA_SPARQL_URL, params: { query: query, format: 'json' }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:results][:bindings].map { |res| res[:item][:value].split('/').last }
+     rescue RestClient::Exception => e
+       raise "Wikidata query #{query} failed: #{e.message}"
+     end
+
+     require 'rest-client'
+
+     def self.morph_wikinames(h)
+       morph_api_url = 'https://api.morph.io/%s/data.json' % h[:source]
+       morph_api_key = ENV['MORPH_API_KEY']
+       table = h[:table] || 'data'
+       result = RestClient.get morph_api_url, params: {
+         key: morph_api_key,
+         query: "SELECT DISTINCT(#{h[:column]}) AS wikiname FROM #{table}",
+       }
+       JSON.parse(result, symbolize_names: true).map { |e| e[:wikiname] }.reject { |n| n.to_s.empty? }
+     end
+
+     require 'pry'
+     def self.wikipedia_xpath(h)
+       noko = noko_for(URI.decode(h[:url]))
+
+       if h[:after]
+         point = noko.xpath(h[:after])
+         raise "Can't find #{h[:after]}" if point.empty?
+         point.xpath('.//preceding::*').remove
+       end
+
+       if h[:before]
+         point = noko.xpath(h[:before])
+         raise "Can't find #{h[:before]}" if point.empty?
+         point.xpath('.//following::*').remove
+       end
+
+       names = noko.xpath(h[:xpath]).map(&:text).uniq
+       binding.pry if h[:debug] == true
+       raise "No names found in #{h[:url]}" if names.count.zero?
+       names
+     end
+
+     require 'open-uri'
+     require 'nokogiri'
+
+     def self.noko_for(url)
+       Nokogiri::HTML(open(URI.escape(URI.unescape(url))).read)
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'scraperwiki'
+
+     def self.scrape_wikidata(h)
+       langs = ((h[:lang] || (h[:names] ||= {}).keys) + [:en]).flatten.uniq
+       langpairs = h[:names].map { |lang, names| WikiData.ids_from_pages(lang.to_s, names) }
+       combined = langpairs.reduce({}) { |a, e| a.merge(e.invert) }
+       (h[:ids] ||= []).each { |id| combined[id] ||= nil }
+       # Clean out existing data
+       ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
+
+       Hash[combined.to_a.shuffle].each_slice(h[:batch_size] || 10_000) do |slice|
+         sliced = Hash[slice]
+         found = WikiData::Fetcher.find(sliced.keys)
+         sliced.each do |id, name|
+           unless found[id]
+             warn "No data for #{id}"
+             next
+           end
+
+           begin
+             data = found[id].data(langs)
+           rescue StandardError => e
+             warn "Problem with #{id}: #{e}"
+             next
+           end
+           next unless data
+
+           data[:original_wikiname] = name
+           puts data if h[:output] == true
+           ScraperWiki.save_sqlite([:id], data)
+         end
+       end
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'rest-client'
+
+     def self.notify_rebuilder
+       RestClient.post ENV['MORPH_REBUILDER_URL'], {} if ENV['MORPH_REBUILDER_URL']
+     end
+   end
+ end
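For completeness, a small sketch of the `WikiData.ids_from_pages` helper defined above, which the README does not cover: it resolves Wikipedia page titles to Wikidata item IDs, keyed by the title that was requested even when that title is a redirect, and warns on stderr about titles it cannot resolve. The example titles are arbitrary:

```ruby
require 'wikidata/fetcher'

# Returns a Hash of { requested_title => wikidata_id }.
ids = WikiData.ids_from_pages('en', ['Douglas Adams', 'Ada Lovelace'])
# => { "Douglas Adams" => "Q42", "Ada Lovelace" => "Q7259" }
```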
@@ -0,0 +1,62 @@
+ require 'mediawiki_api'
+
+ class WikiData
+   class Category < WikiData
+     def initialize(page, lang = 'en')
+       @_page = page
+       @_lang = lang
+     end
+
+     def client
+       @_client ||= MediawikiApi::Client.new "https://#{@_lang}.wikipedia.org/w/api.php"
+     end
+
+     def _categorymembers_search(args = {})
+       cat_args = {
+         cmtitle: @_page,
+         token_type: false,
+         list: 'categorymembers',
+         cmlimit: '500',
+       }.merge(args)
+       client.action :query, cat_args
+     end
+
+     def members
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       while search['continue']
+         search = _categorymembers_search(cmcontinue: search['continue']['cmcontinue'])
+         all << search.data['categorymembers']
+       end
+       all.flatten.select { |m| (m['ns']).zero? }
+     end
+
+     def subcategories
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       all.flatten.select { |m| m['ns'] == 14 }.map { |m| m['title'] }
+     end
+
+     def member_ids
+       members.map { |m| m['pageid'] }.sort
+     end
+
+     def member_titles
+       members.map { |m| m['title'] }.sort
+     end
+
+     def wikidata_ids
+       member_ids.compact.each_slice(50).map do |ids|
+         page_args = {
+           prop: 'pageprops',
+           ppprop: 'wikibase_item',
+           redirects: 1,
+           pageids: ids.join('|'),
+           token_type: false,
+         }
+         response = client.action :query, page_args
+         response.data['pages'].find_all { |p| p.last.key? 'pageprops' }.map { |p| p.last['pageprops']['wikibase_item'] }
+       end.flatten
+     end
+   end
+ end
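Beyond the `member_titles` call shown in the README, the Category class above also exposes `member_ids`, `subcategories` and `wikidata_ids`. A brief usage sketch; the category name is only an example:

```ruby
require 'wikidata/fetcher'

category = WikiData::Category.new('Category:Prime Ministers of Tuvalu', 'en')

category.member_titles   # sorted article titles in the category (namespace 0)
category.member_ids      # sorted page IDs for those articles
category.subcategories   # titles of sub-categories (namespace 14)
category.wikidata_ids    # wikibase item IDs ("Q…") for the member pages
```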