wikidata-fetcher 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6e3918bdb5435b10a4a3c1f9ff68b5a4c8102e87
+   data.tar.gz: 271b48ba804f121f56f5b3ff35e7bdf849d20b07
+ SHA512:
+   metadata.gz: e6926e4a68c676d25541fb0635b7a5337ab3e874e98c29b290e06bc60c998b3e0ecf46536ba3de3b611070b39460102a8dd6ea3ebdeec6b1586fa60b4aaff7af
+   data.tar.gz: c6aced9d0992480c08bcee60be1a1dcd5c47b385282d7554008ddfbc8251174f80b114902433aa953e1fdd0511a921a51b046fd85d5cd2308192973c3689b1b8
@@ -0,0 +1,27 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+
+ *.swp
+
+ *.cache
+ .rubocop-https---raw-githubusercontent-com-everypolitician-everypolitician-data-master--rubocop-base-yml
@@ -0,0 +1,21 @@
+ AllCops:
+   Exclude:
+     - 'Vagrantfile'
+     - 'vendor/**/*'
+   TargetRubyVersion: 2.0
+
+ inherit_from:
+   - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
+   - .rubocop_todo.yml
+
+ Style/AndOr:
+   Exclude:
+     - 'lib/wikidata/fetcher.rb'
+
+ # despite Rubocop's insistence, we can't replace .find_all with .select
+ # everywhere, as they do very different things on a Hash
+ # http://stackoverflow.com/questions/20999192/is-find-all-and-select-the-same-thing/21000136#21000136
+ Style/CollectionMethods:
+   Exclude:
+     - 'lib/wikidata/category.rb'
+
@@ -0,0 +1,24 @@
+ # This configuration was generated by
+ # `rubocop --auto-gen-config`
+ # on 2017-02-11 17:57:53 +0000 using RuboCop version 0.42.0.
+ # The point is for the user to remove these configuration records
+ # one by one as the offenses are removed from the code base.
+ # Note that changes in the inspected code, or installation of new
+ # versions of RuboCop, may require this file to be generated again.
+
+ # Offense count: 6
+ Metrics/AbcSize:
+   Max: 38
+
+ # Offense count: 3
+ Metrics/CyclomaticComplexity:
+   Max: 8
+
+ # Offense count: 5
+ # Configuration parameters: CountComments.
+ Metrics/MethodLength:
+   Max: 25
+
+ # Offense count: 2
+ Metrics/PerceivedComplexity:
+   Max: 8
@@ -0,0 +1,10 @@
+ sudo: false
+ language: ruby
+ cache: bundler
+ before_install:
+   - gem update --system --no-doc
+   - gem install bundler -v 1.14.1
+ rvm:
+   - 2.3.1
+   - 2.2
+   - 2.1
data/CHANGES ADDED
@@ -0,0 +1,67 @@
+ 0.19.0 2016-11-26
+   - Add new EveryPolitician::Wikidata.sparql method for running SPARQL
+     queries
+
+ 0.18.0 2016-10-01
+   - We no longer cache MediaWiki API query results. If you need to do
+     something like that, you should be doing it locally.
+
+ 0.17.0 2016-06-17
+   - Can now specify an optional `table` parameter when fetching data
+     from a Morph scraper
+
+ 0.16.0 2016-06-14
+   - Better handling of errors from Wikisnakker
+
+ 0.15.0 2016-06-05
+   - Delete existing data before re-scraping
+
+ 0.14.0 2016-03-27
+   - Add a `subcategories` method on Category
+   - Move the `last_seen` to a different table
+
+ 0.13.0 2016-03-26
+   - Report on missing Wikidata IDs
+   - Add a `last_seen` column
+
+ 0.12.0 2016-03-25
+   - Add a `batch_size` option
+
+ 0.11.0 2016-03-25
+   - Move the list of want/skip properties to external file
+
+ 0.10.0 2016-02-06
+   - Remove bracketed sections at end of names
+   - Give details of which article has the unknown known
+
+ 0.9.0 2016-02-02
+   - Add WDQ lookup method
+   - Fetch data by IDs as well as names
+
+ 0.8.0 2016-01-12
+   - Fetch data in bulk where possible
+   - Add 'before' and 'after' options to XPath-finder
+
+ 0.7.2 2015-12-31
+   - Add some Scraper helper functions in EveryPolitician::Wikidata
+   - Split out a function for Category member names
+   - Additional properties
+
+ 0.7.0 2015-12-31
+
+ 0.6.4 2015-11-28
+   - Austrian Parliament ID property
+
+ 0.6.0 2015-10-04
+   - return the original pagename where redirected
+
+ 0.5.0 2015-09-23
+   - fetch _all_ names and wikipedia links, not just in the requested
+     languages
+
+ 0.4.1 2015-09-23
+   - Additional properties
+
+ 0.4.0 2015-09-12
+   - resolve redirects automatically
+
data/Gemfile ADDED
@@ -0,0 +1,6 @@
+ source 'https://rubygems.org'
+
+ gem 'wikisnakker', git: 'https://github.com/everypolitician/wikisnakker', branch: 'master'
+
+ # Specify your gem's dependencies in wikidata-fetcher.gemspec
+ gemspec
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Tony Bowden
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,68 @@
+ # Wikidata::Fetcher
+
+ Fetch information useful to EveryPolitician from Wikidata
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem "wikidata-fetcher", git: "https://github.com/everypolitician/wikidata-fetcher.git"
+
+ ## Usage
+
+ ```
+ require 'wikidata/fetcher'
+
+ #------------------------------------------
+ # Step 1: Get a list of Wikipedia pagenames
+ #------------------------------------------
+
+ # from a Wikipedia page, by XPath
+ en_names = EveryPolitician::Wikidata.wikipedia_xpath(
+   url: 'https://en.wikipedia.org/wiki/Template:Peruvian_Congress_2011-2016',
+   xpath: '//table//td[contains(@class,"navbox-list")]//li//a[not(@class="new")]/@title',
+ )
+
+ # or from a Wikipedia Category
+ es_names = WikiData::Category.new('Categoría:Congresistas de Perú 2011-2016', 'es').member_titles
+
+ # or from a Morph scraper
+ names = EveryPolitician::Wikidata.morph_wikinames(source: 'tmtmtmtm/tuvalu-parliament-wikipedia', column: 'wikiname')
+
+ # or from a SPARQL query
+ ids = EveryPolitician::Wikidata.sparql('SELECT ?item WHERE { ?item wdt:P39 wd:Q18229570 . }')
+
+ # or from a WDQ query
+ ids = EveryPolitician::Wikidata.wdq('claim[463:21124329]')
+
+ #-----------------------------------------------------------
+ # Step 2: Scrape the data from Wikidata based on these names
+ #-----------------------------------------------------------
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: { en: names })
+
+ # NB: this can take multiple lists, and can also output the data as it fetches it:
+
+ EveryPolitician::Wikidata.scrape_wikidata(names: {
+   es: es_names,
+   en: en_names,
+ }, output: true)
+
+ #-----------------------------
+ # Step 3: Notify the Rebuilder
+ #-----------------------------
+
+ EveryPolitician::Wikidata.notify_rebuilder
+
+ # (This requires MORPH_REBUILDER_URL to be set in the environment)
+
+ ```
+
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/everypolitician/wikidata-fetcher/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
@@ -0,0 +1,14 @@
+ require 'bundler/gem_tasks'
+
+ require 'rake/testtask'
+ require 'rubocop/rake_task'
+
+ RuboCop::RakeTask.new
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 't'
+   t.libs << 'lib'
+   t.test_files = FileList['t/*.rb']
+ end
+
+ task default: %i(test rubocop)
@@ -0,0 +1,140 @@
+ require 'colorize'
+ require 'digest/sha1'
+ require 'json'
+ require 'mediawiki_api'
+ require 'require_all'
+ require 'wikisnakker'
+
+ require_rel '.'
+
+ class WikiData
+   def self.ids_from_pages(lang, titles)
+     client = MediawikiApi::Client.new "https://#{lang}.wikipedia.org/w/api.php"
+     res = titles.compact.each_slice(50).map do |sliced|
+       page_args = {
+         prop: 'pageprops',
+         ppprop: 'wikibase_item',
+         redirects: 1,
+         titles: sliced.join('|'),
+         token_type: false,
+       }
+       response = client.action :query, page_args
+       redirected_from = Hash[(response.data['redirects'] || []).map { |h| [h['to'], h['from']] }]
+       response.data['pages'].select { |_k, v| v.key? 'pageprops' }.map do |_k, v|
+         [redirected_from[v['title']] || v['title'], v['pageprops']['wikibase_item']]
+       end
+     end
+     results = Hash[res.flatten(1)]
+     missing = titles - results.keys
+     warn "Can't find Wikidata IDs for: #{missing.join(', ')} in #{lang}" if missing.any?
+     results
+   end
+ end
+
+ module EveryPolitician
+   module Wikidata
+     WDQ_URL = 'https://wdq.wmflabs.org/api'.freeze
+
+     def self.wdq(query)
+       result = RestClient.get WDQ_URL, params: { q: query }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:items].map { |id| "Q#{id}" }
+     end
+
+     WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql'.freeze
+
+     def self.sparql(query)
+       result = RestClient.get WIKIDATA_SPARQL_URL, params: { query: query, format: 'json' }
+       json = JSON.parse(result, symbolize_names: true)
+       json[:results][:bindings].map { |res| res[:item][:value].split('/').last }
+     rescue RestClient::Exception => e
+       raise "Wikidata query #{query} failed: #{e.message}"
+     end
+
+     require 'rest-client'
+
+     def self.morph_wikinames(h)
+       morph_api_url = 'https://api.morph.io/%s/data.json' % h[:source]
+       morph_api_key = ENV['MORPH_API_KEY']
+       table = h[:table] || 'data'
+       result = RestClient.get morph_api_url, params: {
+         key: morph_api_key,
+         query: "SELECT DISTINCT(#{h[:column]}) AS wikiname FROM #{table}",
+       }
+       JSON.parse(result, symbolize_names: true).map { |e| e[:wikiname] }.reject { |n| n.to_s.empty? }
+     end
+
+     require 'pry'
+     def self.wikipedia_xpath(h)
+       noko = noko_for(URI.decode(h[:url]))
+
+       if h[:after]
+         point = noko.xpath(h[:after])
+         raise "Can't find #{h[:after]}" if point.empty?
+         point.xpath('.//preceding::*').remove
+       end
+
+       if h[:before]
+         point = noko.xpath(h[:before])
+         raise "Can't find #{h[:before]}" if point.empty?
+         point.xpath('.//following::*').remove
+       end
+
+       names = noko.xpath(h[:xpath]).map(&:text).uniq
+       binding.pry if h[:debug] == true
+       raise "No names found in #{h[:url]}" if names.count.zero?
+       names
+     end
+
+     require 'open-uri'
+     require 'nokogiri'
+
+     def self.noko_for(url)
+       Nokogiri::HTML(open(URI.escape(URI.unescape(url))).read)
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'scraperwiki'
+
+     def self.scrape_wikidata(h)
+       langs = ((h[:lang] || (h[:names] ||= {}).keys) + [:en]).flatten.uniq
+       langpairs = h[:names].map { |lang, names| WikiData.ids_from_pages(lang.to_s, names) }
+       combined = langpairs.reduce({}) { |a, e| a.merge(e.invert) }
+       (h[:ids] ||= []).each { |id| combined[id] ||= nil }
+       # Clean out existing data
+       ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
+
+       Hash[combined.to_a.shuffle].each_slice(h[:batch_size] || 10_000) do |slice|
+         sliced = Hash[slice]
+         found = WikiData::Fetcher.find(sliced.keys)
+         sliced.each do |id, name|
+           unless found[id]
+             warn "No data for #{id}"
+             next
+           end
+
+           begin
+             data = found[id].data(langs)
+           rescue StandardError => e
+             warn "Problem with #{id}: #{e}"
+             next
+           end
+           next unless data
+
+           data[:original_wikiname] = name
+           puts data if h[:output] == true
+           ScraperWiki.save_sqlite([:id], data)
+         end
+       end
+     end
+
+     #-------------------------------------------------------------------
+
+     require 'rest-client'
+
+     def self.notify_rebuilder
+       RestClient.post ENV['MORPH_REBUILDER_URL'], {} if ENV['MORPH_REBUILDER_URL']
+     end
+   end
+ end
@@ -0,0 +1,62 @@
+ require 'mediawiki_api'
+
+ class WikiData
+   class Category < WikiData
+     def initialize(page, lang = 'en')
+       @_page = page
+       @_lang = lang
+     end
+
+     def client
+       @_client ||= MediawikiApi::Client.new "https://#{@_lang}.wikipedia.org/w/api.php"
+     end
+
+     def _categorymembers_search(args = {})
+       cat_args = {
+         cmtitle: @_page,
+         token_type: false,
+         list: 'categorymembers',
+         cmlimit: '500',
+       }.merge(args)
+       client.action :query, cat_args
+     end
+
+     def members
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       while search['continue']
+         search = _categorymembers_search(cmcontinue: search['continue']['cmcontinue'])
+         all << search.data['categorymembers']
+       end
+       all.flatten.select { |m| (m['ns']).zero? }
+     end
+
+     def subcategories
+       search = _categorymembers_search
+       all = search.data['categorymembers']
+       all.flatten.select { |m| m['ns'] == 14 }.map { |m| m['title'] }
+     end
+
+     def member_ids
+       members.map { |m| m['pageid'] }.sort
+     end
+
+     def member_titles
+       members.map { |m| m['title'] }.sort
+     end
+
+     def wikidata_ids
+       member_ids.compact.each_slice(50).map do |ids|
+         page_args = {
+           prop: 'pageprops',
+           ppprop: 'wikibase_item',
+           redirects: 1,
+           pageids: ids.join('|'),
+           token_type: false,
+         }
+         response = client.action :query, page_args
+         response.data['pages'].find_all { |p| p.last.key? 'pageprops' }.map { |p| p.last['pageprops']['wikibase_item'] }
+       end.flatten
+     end
+   end
+ end