wikidata-fetcher 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,117 @@
1
+ require 'json'
2
+ require 'open-uri'
3
+ require 'require_all'
4
+ require 'wikisnakker'
5
+
6
+ require_rel '..'
7
+
8
# Top-level namespace class for this gem's Wikidata helpers.
class WikiData
  # Wraps a single Wikisnakker item and flattens it into a plain Hash whose
  # keys are the field names configured in the remote lookup file.
  class Fetcher < WikiData
    LOOKUP_FILE = 'https://raw.githubusercontent.com/everypolitician/wikidata-fetcher/master/lookup.json'.freeze

    # One parenthesised group (nested groups allowed) preceded by a space and
    # sitting at the very end of the string. Anchoring on the *last* group
    # keeps earlier parentheticals such as "John (Jack) Smith" intact.
    TRAILING_BRACKETS = / (?<paren>\((?:[^()]|\g<paren>)*\))\z/.freeze

    # Bulk lookup.
    #
    # @param ids the ids handed straight to Wikisnakker::Item.find
    # @return [Hash] each found item's id mapped to a Fetcher wrapping it
    def self.find(ids)
      Wikisnakker::Item.find(ids).map { |wditem| [wditem.id, new(item: wditem)] }.to_h
    end

    # The parsed lookup file (symbolized keys), downloaded once and memoized
    # on the class. The instance code reads its :skip and :want sections.
    #
    # @return [Hash]
    def self.wikidata_properties
      @wikidata_properties ||= JSON.parse(read_lookup_file, symbolize_names: true)
    end

    # Downloads the raw lookup JSON. Uses the public URI.open where open-uri
    # provides it (bare Kernel#open no longer accepts URLs on Ruby 3.0+) and
    # falls back to Kernel#open on older Rubies. The block form closes the
    # handle once it has been read.
    def self.read_lookup_file
      return URI.open(LOOKUP_FILE, &:read) if URI.respond_to?(:open)

      open(LOOKUP_FILE, &:read)
    end
    private_class_method :read_lookup_file

    # @param h [Hash] either { id: ... } to look the item up through
    #   Wikisnakker, or { item: ... } to wrap an already-fetched item
    # @raise [RuntimeError] when neither key is given, the item cannot be
    #   found, or the item has no id
    def initialize(h)
      if h[:id]
        @item = Wikisnakker::Item.find(h[:id]) || raise("No such item #{h[:id]}")
        @id = @item.id || raise("No ID for #{h[:id]} = #{@item}")
        warn "Different ID (#{@id}) for #{h[:id]}" if @id != h[:id]
      elsif h[:item]
        # Already have a Wikisnakker::Item, eg from a bulk lookup
        @item = h[:item]
        # h[:id] is always nil in this branch, so it is left out of the message
        @id = @item.id || raise("No ID for #{@item}")
      else
        raise 'No id'
      end
    end

    # Builds the flattened record for this item.
    #
    # @param lang language codes in order of preference for the :name field;
    #   'en' is always appended as the final fallback. A single array of
    #   codes is accepted as well as separate arguments.
    # @return [Hash, nil] id, name, per-language names, sitelink titles and
    #   every wanted property that has a value; nil when the item has claims
    #   but is not an instance of 'human'
    def data(*lang)
      return unless item

      result = {
        id: id,
        name: first_label_used(lang.flatten | ['en']),
      }.merge(labels).merge(wikipedia_links)

      # Short-circuit if there are no claims
      return result if item.properties.empty?

      # Short-circuit if this is not a human
      unless human?
        warn "‼ #{id} is_instance_of #{type.join(' & ')}. Skipping"
        return nil
      end

      unknown_properties.each do |p|
        puts "⁇ Unknown property: https://www.wikidata.org/wiki/Property:#{p} for #{id}"
      end

      wanted_properties.each do |p|
        val = property_value(p)
        next warn "Unknown value for #{p} for #{id}" unless val
        result[want[p].to_sym] = val
      end

      result
    end

    private

    attr_reader :item, :id

    # The :skip section of the lookup file.
    def skip
      @skip ||= self.class.wikidata_properties[:skip]
    end

    # The :want section of the lookup file (property id => output field name).
    def want
      @want ||= self.class.wikidata_properties[:want]
    end

    # English labels of everything this item is an instance of (P31).
    def type
      item.P31s.map { |p| p.value.label('en') }
    end

    def human?
      type.include? 'human'
    end

    # Properties on the item that the lookup file neither skips nor wants.
    def unknown_properties
      item.properties.reject { |c| skip[c] || want[c] }
    end

    # Properties on the item that the lookup file asks for.
    def wanted_properties
      item.properties & want.keys
    end

    # One :name__<lang> entry per label ('-' in the language code becomes
    # '_'), with any bracketed element at the end of the label removed.
    def labels
      item.labels.map do |lang, label|
        ["name__#{lang.to_s.tr('-', '_')}".to_sym, label[:value].sub(TRAILING_BRACKETS, '')]
      end.to_h
    end

    # One :wikipedia__<site> entry per sitelink, with a trailing 'wiki'
    # dropped from the site key.
    def wikipedia_links
      item.sitelinks.map do |site, link|
        ["wikipedia__#{site.to_s.sub(/wiki$/, '')}".to_sym, link.title]
      end.to_h
    end

    # Value of a single property: the raw value, or its English label when
    # the value is itself something with labels. Returns nil when the value
    # is missing or could not be read.
    def property_value(property)
      val = begin
        item[property].value
      rescue StandardError
        # Best effort, as before: an unreadable value counts as missing and
        # is reported by the caller.
        nil
      end
      return unless val
      return val unless val.respond_to?(:label)
      return unless val.labels
      val.label('en')
    end

    # The label for the first of +language_codes+ (in the caller's order)
    # that the item actually has, or nil when none of them is present.
    def first_label_used(language_codes)
      available = item.labels.keys
      preferred = language_codes.map(&:to_sym).find { |code| available.include?(code) }
      preferred && item.labels[preferred][:value]
    end
  end
end
@@ -0,0 +1,5 @@
1
# Version metadata for the gem, kept under the Wikidata::Fetcher namespace.
module Wikidata
  module Fetcher
    # Released gem version.
    VERSION = '0.19.1'.freeze
  end
end
@@ -0,0 +1,356 @@
1
+ {
2
+ "skip": {
3
+ "P7": "Brother",
4
+ "P9": "Sister",
5
+ "P10": "video",
6
+ "P17": "Country",
7
+ "P19": "Place of Birth",
8
+ "P20": "Place of Death",
9
+ "P22": "Father",
10
+ "P25": "Mother",
11
+ "P26": "Spouse",
12
+ "P27": "Country of Citizenship",
13
+ "P31": "Instance of",
14
+ "P39": "Position Held",
15
+ "P40": "Child",
16
+ "P43": "Stepfather",
17
+ "P51": "audio",
18
+ "P53": "noble family",
19
+ "P54": "Member of sports team",
20
+ "P66": "Ancestral home",
21
+ "P69": "Educated at",
22
+ "P91": "Sexual orientation",
23
+ "P94": "coat of arms image",
24
+ "P101": "Field of Work",
25
+ "P103": "Native language",
26
+ "P102": "Party",
27
+ "P108": "Employer",
28
+ "P109": "Signature",
29
+ "P119": "Place of burial",
30
+ "P135": "movement",
31
+ "P136": "genre",
32
+ "P138": "named after",
33
+ "P140": "Religion",
34
+ "P155": "follows",
35
+ "P156": "followed by",
36
+ "P157": "killed by",
37
+ "P166": "Award received",
38
+ "P172": "Ethnic group",
39
+ "P184": "Doctoral advisor",
40
+ "P241": "Military branch",
41
+ "P361": "part of",
42
+ "P373": "Commons category",
43
+ "P410": "Military rank",
44
+ "P412": "voice type",
45
+ "P413": "position on team",
46
+ "P425": "field of this profession",
47
+ "P428": "Botanist author",
48
+ "P443": "Pronunciation audio",
49
+ "P450": "Astronaut mission",
50
+ "P451": "Cohabitant",
51
+ "P463": "Member of",
52
+ "P485": "Archives at",
53
+ "P488": "Chairperson",
54
+ "P495": "Country of origin",
55
+ "P509": "Cause of death",
56
+ "P512": "Academic degree",
57
+ "P535": "Find a Grave",
58
+ "P551": "Residence",
59
+ "P552": "handedness",
60
+ "P555": "Tennis doubles record",
61
+ "P564": "Tennis singles record",
62
+ "P598": "Commander of",
63
+ "P607": "Conflicts",
64
+ "P641": "Sport",
65
+ "P647": "drafted by",
66
+ "P650": "RKDartists",
67
+ "P737": "influenced by",
68
+ "P741": "tennis playing hand",
69
+ "P793": "significant event",
70
+ "P800": "Notable work",
71
+ "P802": "student",
72
+ "P812": "Academic major",
73
+ "P866": "Perlentaucher ID",
74
+ "P898": "IPA",
75
+ "P900": "<deleted>",
76
+ "P910": "Main category",
77
+ "P935": "Commons gallery",
78
+ "P937": "Work location",
79
+ "P941": "Inspired by",
80
+ "P948": "Wikivoyage banner",
81
+ "P990": "voice recording",
82
+ "P1019": "feed URL",
83
+ "P1026": "doctoral thesis",
84
+ "P1038": "Relative",
85
+ "P1050": "Medical condition",
86
+ "P1066": "Student of",
87
+ "P1087": "Elo rating",
88
+ "P1151": "Wikimedia portal",
89
+ "P1185": "Rodovid ID",
90
+ "P1196": "Manner of death",
91
+ "P1220": "Broadway Database ID",
92
+ "P1233": "Speculative fiction DB",
93
+ "P1317": "floruit",
94
+ "P1303": "instrument played",
95
+ "P1321": "Place of Origin (Swiss)",
96
+ "P1343": "Described by source",
97
+ "P1344": "Participant in",
98
+ "P1399": "Convicted of",
99
+ "P1409": "Cycling Archives ID",
100
+ "P1411": "nominated for",
101
+ "P1412": "Languages",
102
+ "P1416": "affiliation",
103
+ "P1429": "pet",
104
+ "P1440": "Identifier FIDE",
105
+ "P1442": "Image of grave",
106
+ "P1447": "SportsReference ID",
107
+ "P1448": "Official name",
108
+ "P1449": "nickname",
109
+ "P1472": "Commons Creator page",
110
+ "P1477": "birth_name",
111
+ "P1532": "Country for sport",
112
+ "P1559": "Name in native language",
113
+ "P1563": "MacTutor id",
114
+ "P1576": "lifestyle",
115
+ "P1618": "sport number",
116
+ "P1665": "Chess Games ID",
117
+ "P1683": "quote",
118
+ "P1728": "AllMusic ID",
119
+ "P1801": "commemorative plaque",
120
+ "P1819": "genealogics ID",
121
+ "P1825": "Baseball-Reference.com ID",
122
+ "P1826": "Baseball-Reference.com ID",
123
+ "P1853": "blood type",
124
+ "P1889": "different from",
125
+ "P1967": "boxrec ID",
126
+ "P1971": "Number of children",
127
+ "P2020": "worldfootball.net",
128
+ "P2021": "Erdős number",
129
+ "P2042": "Artsy ID",
130
+ "P2048": "height",
131
+ "P2067": "mass",
132
+ "P2091": "FISA ID",
133
+ "P2174": "MOMA ID",
134
+ "P2193": "Soccerbase ID",
135
+ "P2276": "UEFA ID",
136
+ "P2350": "Speedskatingbase.eu ID",
137
+ "P2416": "sports discipline",
138
+ "P2423": "FIE fencer ID",
139
+ "P2446": "transfermarkt player ID",
140
+ "P2447": "transfermarkt manager ID",
141
+ "P2448": "Turkish Football Federation player ID",
142
+ "P2449": "Turkish Football Federation manager ID",
143
+ "P2458": "Mackolik ID",
144
+ "P2574": "NFT ID",
145
+ "P2593": "Latvian Olympic athlete ID",
146
+ "P2632": "place of detention",
147
+ "P2640": "SwimRankings ID",
148
+ "P2696": "FIG gymnast ID",
149
+ "P2727": "United World Wrestling ID",
150
+ "P2728": "CageMatch ID",
151
+ "P2767": "JudoInside ID",
152
+ "P2773": "FIS cross-country skier ID",
153
+ "P2779": "IAT weightlifter ID",
154
+ "P2796": "FIG gymnast ID",
155
+ "P2818": "Sherdog ID",
156
+ "P3065": "identifier__rero",
157
+ "P3172": "World Bridge Federation ID",
158
+ "P3373": "sibling"
159
+ },
160
+ "want": {
161
+ "P18": "image",
162
+ "P21": "gender",
163
+ "P97": "noble_title",
164
+ "P106": "occupation",
165
+ "P213": "identifier__ISNI",
166
+ "P214": "identifier__VIAF",
167
+ "P227": "identifier__GND",
168
+ "P244": "identifier__LCAuth",
169
+ "P245": "identifier__ULAN",
170
+ "P268": "identifier__BNF",
171
+ "P269": "identifier__SUDOC",
172
+ "P271": "identifier__CiNii",
173
+ "P345": "identifier__IMDB",
174
+ "P349": "identifier__NDL",
175
+ "P396": "identifier__SBN_it",
176
+ "P409": "identifier__NLA",
177
+ "P434": "identifier__MusicBrainz",
178
+ "P496": "identifier__ORCID",
179
+ "P511": "honorific_prefix",
180
+ "P513": "birth_name",
181
+ "P536": "identifier__ATP",
182
+ "P549": "identifier__MGP",
183
+ "P553": "website",
184
+ "P569": "birth_date",
185
+ "P570": "death_date",
186
+ "P599": "identifier__ITF",
187
+ "P640": "identifier__leonore",
188
+ "P646": "identifier__freebase",
189
+ "P648": "identifier__OLID",
190
+ "P651": "identifier__BPN",
191
+ "P691": "identifier__NKC",
192
+ "P723": "identifier__DBNL",
193
+ "P734": "family_name",
194
+ "P735": "given_name",
195
+ "P742": "pseudonym",
196
+ "P768": "electoral_district",
197
+ "P856": "website",
198
+ "P865": "identifier__BMLO",
199
+ "P902": "identifier__HDS",
200
+ "P906": "identifier__SELIBR",
201
+ "P947": "identifier__RSL",
202
+ "P949": "identifier__NLI",
203
+ "P950": "identifier__BNE",
204
+ "P951": "identifier__NSZL",
205
+ "P968": "email",
206
+ "P998": "identifier__dmoz",
207
+ "P1005": "identifier__PTBNP",
208
+ "P1006": "identifier__NTA",
209
+ "P1015": "identifier__BIBSYS",
210
+ "P1025": "identifier__SUDOC",
211
+ "P1017": "identifier__BAV",
212
+ "P1035": "honorific_suffix",
213
+ "P1045": "identifier__sycomore",
214
+ "P1047": "identifier__catholic_hierarchy",
215
+ "P1048": "identifier__NCL",
216
+ "P1146": "identifier__IIAF",
217
+ "P1157": "identifier__UScongress",
218
+ "P1186": "identifier__EuroparlMEP",
219
+ "P1207": "identifier__NUKAT",
220
+ "P1213": "identifier__NLC",
221
+ "P1214": "identifier__Riksdagen",
222
+ "P1229": "identifier__openpolis",
223
+ "P1258": "identifier__rotten_tomatoes",
224
+ "P1263": "identifier__NNDB",
225
+ "P1266": "identifier__AlloCine",
226
+ "P1273": "identifier__CANTIC",
227
+ "P1280": "identifier__CONOR",
228
+ "P1284": "identifier__Munzinger",
229
+ "P1285": "identifier__Munzinger",
230
+ "P1288": "identifier__Munzinger",
231
+ "P1289": "identifier__KLfG",
232
+ "P1291": "identifier__ADS",
233
+ "P1296": "identifier__GNC",
234
+ "P1307": "identifier__parlamentDOTch",
235
+ "P1309": "identifier__EGAXA",
236
+ "P1315": "identifier__NLAtrove",
237
+ "P1331": "identifier__PACE",
238
+ "P1341": "identifier__italian_cod",
239
+ "P1368": "identifier__LNB",
240
+ "P1375": "identifier__NSK",
241
+ "P1387": "political_alignment",
242
+ "P1415": "identifier__Oxforddnb",
243
+ "P1417": "identifier__Britannica",
244
+ "P1430": "identifier__OpenPlaques",
245
+ "P1438": "identifier__JewishEnc",
246
+ "P1469": "identifier__FIFA",
247
+ "P1556": "identifier__zbMATH",
248
+ "P1581": "weblog",
249
+ "P1607": "identifier__dialnet",
250
+ "P1615": "identifier__CLARA",
251
+ "P1617": "identifier__BBC_things",
252
+ "P1631": "identifier__ChinaVitae",
253
+ "P1648": "identifier__WelshBiography",
254
+ "P1649": "identifier__KMDb",
255
+ "P1650": "identifier__BBF",
256
+ "P1670": "identifier__LAC",
257
+ "P1695": "identifier__NLP",
258
+ "P1710": "identifier__saebi",
259
+ "P1711": "identifier__britishmuseum",
260
+ "P1713": "identifier__bundestag",
261
+ "P1714": "identifier__journalisted",
262
+ "P1741": "identifier__GTAA",
263
+ "P1749": "identifier__parlement",
264
+ "P1808": "identifier__senatDOTfr",
265
+ "P1814": "name__kana",
266
+ "P1816": "identifier__NPG",
267
+ "P1839": "identifier__FEC",
268
+ "P1871": "identifier__CERL",
269
+ "P1883": "identifier__declarator",
270
+ "P1890": "identifier__BNC",
271
+ "P1907": "identifier__AuDB",
272
+ "P1938": "identifier__Gutenberg",
273
+ "P1946": "identifier__N6I",
274
+ "P1969": "identifier__moviemeter",
275
+ "P1950": "second_surname",
276
+ "P1953": "identifier__discogs",
277
+ "P1959": "identifier__senate_nl",
278
+ "P1960": "identifier__google_scholar",
279
+ "P1980": "identifier__polsys",
280
+ "P1982": "identifier__anime_news",
281
+ "P1986": "identifier__treccani",
282
+ "P1996": "identifier__parliamentDOTuk",
283
+ "P2002": "twitter",
284
+ "P2003": "instagram",
285
+ "P2005": "identifier__halensis",
286
+ "P2013": "facebook",
287
+ "P2015": "identifier__hansard",
288
+ "P2018": "identifier__teuchos",
289
+ "P2019": "identifier__allmovie",
290
+ "P2029": "identifier__DoUB",
291
+ "P2035": "linkedin",
292
+ "P2163": "identifier__FAST",
293
+ "P2168": "identifier__SFDb",
294
+ "P2169": "identifier__publicwhip",
295
+ "P2170": "identifier__current_hansard",
296
+ "P2171": "identifier__TWFY",
297
+ "P2172": "identifier__parliamentary_record",
298
+ "P2173": "identifier__bbc_democracy_live",
299
+ "P2180": "identifier__kansallisbiografia",
300
+ "P2181": "identifier__eduskunta",
301
+ "P2182": "identifier__valtioneuvostosta",
302
+ "P2188": "identifier__BiblioNet",
303
+ "P2190": "identifier__CSPAN",
304
+ "P2255": "identifier__debretts_today",
305
+ "P2267": "identifier__politifact",
306
+ "P2273": "identifier__heidelberg",
307
+ "P2277": "identifier__magdeburg",
308
+ "P2278": "identifier__hellenic_parliament",
309
+ "P2280": "identifier__parlaments_at",
310
+ "P2342": "identifier__AGORHA",
311
+ "P2372": "identifier__ODIS",
312
+ "P2383": "identifier__CTHS",
313
+ "P2387": "identifier__elonet",
314
+ "P2390": "identifier__ballotpedia",
315
+ "P2397": "identifier__youtube",
316
+ "P2435": "identifier__PORT",
317
+ "P2487": "identifier__google_plus",
318
+ "P2492": "identifier__MTMT",
319
+ "P2519": "identifier__SCOPE",
320
+ "P2558": "identifier__autores_uy",
321
+ "P2562": "married_name",
322
+ "P2604": "identifier__kinopoisk",
323
+ "P2605": "identifier__CSFD",
324
+ "P2611": "identifier__TED",
325
+ "P2626": "identifier__DNF",
326
+ "P2639": "identifier__filmportal_de",
327
+ "P2671": "identifier__google_knowledge",
328
+ "P2686": "identifier__opensecrets",
329
+ "P2732": "identifier__persee",
330
+ "P2736": "identifier__FJC_bio",
331
+ "P2799": "identifier__BVMC",
332
+ "P2847": "identifier__google_plus",
333
+ "P2924": "identifier__bigenc_ru",
334
+ "P2949": "identifier__wikitree",
335
+ "P2953": "identifier__etis_ee",
336
+ "P2963": "identifier__goodreads",
337
+ "P3029": "identifier__national_archives",
338
+ "P3072": "identifier__storting",
339
+ "P3106": "identifier__guardian_topic",
340
+ "P3124": "identifier__polish_scientist",
341
+ "P3185": "identifier__vkontakte",
342
+ "P3188": "identifier__nobel_prize",
343
+ "P3221": "identifier__nyt_topic",
344
+ "P3222": "identifier__ne_se",
345
+ "P3258": "identifier__livejournal",
346
+ "P3267": "identifier__flickr",
347
+ "P3297": "identifier__flemish_parliament",
348
+ "P3298": "identifier__belgian_senate",
349
+ "P3344": "identifier__votesmart",
350
+ "P3348": "identifier__nlg",
351
+ "P3391": "identifier__verkhovna_rada",
352
+ "P3417": "identifier__quora",
353
+ "P3430": "identifier__snac",
354
+ "P3479": "identifier__omni"
355
+ }
356
+ }