wikidata-fetcher 0.19.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,117 @@
1
+ require 'json'
2
+ require 'open-uri'
3
+ require 'require_all'
4
+ require 'wikisnakker'
5
+
6
+ require_rel '..'
7
+
8
# Top-level namespace class; re-opened here to hold the Fetcher subclass.
class WikiData
  # Wraps a single Wikisnakker::Item and flattens it into a Hash of labels,
  # Wikipedia sitelinks and selected claims. Which claims are exported (and
  # which are silently ignored) is driven by the JSON document at LOOKUP_FILE.
  class Fetcher < WikiData
    LOOKUP_FILE = 'https://raw.githubusercontent.com/everypolitician/wikidata-fetcher/master/lookup.json'.freeze

    # Bulk lookup.
    #
    # @param ids the ID(s) handed straight to Wikisnakker::Item.find
    # @return [Hash] the id of every item returned, mapped to a Fetcher
    #   wrapping that item
    def self.find(ids)
      Wikisnakker::Item.find(ids).map { |wditem| [wditem.id, new(item: wditem)] }.to_h
    end

    # Downloads and parses LOOKUP_FILE; the parsed result is memoized on the
    # class, so the download happens at most once per process.
    #
    # @return [Hash] parsed JSON with symbolized keys; this class reads its
    #   :skip and :want entries
    def self.wikidata_properties
      # URI#read comes from open-uri. The original `open(url)` depended on
      # open-uri patching Kernel#open, which was deprecated in Ruby 2.7 and
      # removed in Ruby 3.0.
      @wikidata_properties ||= JSON.parse(URI.parse(LOOKUP_FILE).read, symbolize_names: true)
    end

    # @param h [Hash] either `id:` (the item is looked up through
    #   Wikisnakker::Item.find) or `item:` (an item that was already fetched,
    #   e.g. by a bulk lookup)
    # @raise [RuntimeError] if neither key is given, the lookup returns
    #   nothing, or the item has no id
    def initialize(h)
      if h[:id]
        @item = Wikisnakker::Item.find(h[:id]) || raise("No such item #{h[:id]}")
        @id = @item.id || raise("No ID for #{h[:id]} = #{@item}")
        warn "Different ID (#{@id}) for #{h[:id]}" if @id != h[:id]
      elsif h[:item]
        # Already have a Wikisnakker::Item, eg from a bulk lookup
        @item = h[:item]
        # h[:id] is always nil in this branch, so it is left out of the message
        @id = @item.id || raise("No ID for #{@item}")
      else
        raise 'No id'
      end
    end

    # Builds the flattened record for this item.
    #
    # @param lang [Array<String, Symbol>] language codes in order of
    #   preference for the :name field; 'en' is appended as the final fallback
    # @return [Hash, nil] the record, or nil when the item has claims but is
    #   not an instance of 'human'
    def data(*lang)
      return unless item

      result = {
        id: id,
        name: first_label_used(lang | ['en']),
      }.merge(labels).merge(wikipedia_links)

      # Short-circuit if there are no claims
      return result if item.properties.empty?

      # Short-circuit if this is not a human
      unless human?
        warn "‼ #{id} is_instance_of #{type.join(' & ')}. Skipping"
        return nil
      end

      unknown_properties.each do |prop|
        puts "⁇ Unknown property: https://www.wikidata.org/wiki/Property:#{prop} for #{id}"
      end

      wanted_properties.each do |prop|
        val = property_value(prop)
        if val
          result[want[prop].to_sym] = val
        else
          warn "Unknown value for #{prop} for #{id}"
        end
      end

      result
    end

    private

    attr_reader :item, :id

    # Properties listed under :skip in the lookup file (known, not exported).
    def skip
      @skip ||= self.class.wikidata_properties[:skip]
    end

    # Properties listed under :want in the lookup file, mapped to the output
    # field name each one is exported as.
    def want
      @want ||= self.class.wikidata_properties[:want]
    end

    # English labels of every P31 ("instance of") value on the item.
    def type
      item.P31s.map { |claim| claim.value.label('en') }
    end

    def human?
      type.include? 'human'
    end

    # Properties present on the item that the lookup file neither skips nor wants.
    def unknown_properties
      item.properties.reject { |prop| skip[prop] || want[prop] }
    end

    # Properties present on the item that the lookup file asks for.
    def wanted_properties
      item.properties & want.keys
    end

    # One :name__<lang> entry per label, with '-' in the language code turned
    # into '_' and any bracketed element at the end of the label removed.
    def labels
      item.labels.map do |lang, label|
        ["name__#{lang.to_s.tr('-', '_')}".to_sym, label[:value].sub(/ \(.*?\)$/, '')]
      end.to_h
    end

    # One :wikipedia__<site> entry per sitelink, with the trailing 'wiki'
    # dropped from the site key.
    def wikipedia_links
      item.sitelinks.map do |site, link|
        ["wikipedia__#{site.to_s.sub(/wiki$/, '')}".to_sym, link.title]
      end.to_h
    end

    # Value of the item's claim for `property`: returned as-is when it has no
    # `label` method, otherwise its English label. Returns nil when the value
    # is missing, cannot be read, or is an entity without labels.
    def property_value(property)
      val = begin
        item[property].value
      rescue StandardError
        # Deliberately best-effort (as in the original): any failure to read
        # the claim is reported by the caller as an unknown value.
        nil
      end
      return unless val
      return val unless val.respond_to?(:label)
      return unless val.labels

      val.label('en')
    end

    # Label text for the first of `language_codes` that the item has a label in.
    def first_label_used(language_codes)
      # Array#& keeps the receiver's order, so the requested codes must be on
      # the left for the caller's preference order to win.
      preferred = (language_codes.map(&:to_sym) & item.labels.keys).first
      return unless preferred

      item.labels[preferred][:value]
    end
  end
end
@@ -0,0 +1,5 @@
1
# Version metadata for the wikidata-fetcher gem.
#
# NOTE(review): this namespace is spelled `Wikidata`, which is a different
# constant from the `WikiData` class that holds the fetcher implementation —
# confirm that the difference is intentional.
module Wikidata
  module Fetcher
    # Released version of the gem, as a frozen String.
    VERSION = '0.19.1'.freeze
  end
end
@@ -0,0 +1,356 @@
1
+ {
2
+ "skip": {
3
+ "P7": "Brother",
4
+ "P9": "Sister",
5
+ "P10": "video",
6
+ "P17": "Country",
7
+ "P19": "Place of Birth",
8
+ "P20": "Place of Death",
9
+ "P22": "Father",
10
+ "P25": "Mother",
11
+ "P26": "Spouse",
12
+ "P27": "Country of Citizenship",
13
+ "P31": "Instance of",
14
+ "P39": "Position Held",
15
+ "P40": "Child",
16
+ "P43": "Stepfather",
17
+ "P51": "audio",
18
+ "P53": "noble family",
19
+ "P54": "Member of sports team",
20
+ "P66": "Ancestral home",
21
+ "P69": "Educated at",
22
+ "P91": "Sexual orientation",
23
+ "P94": "coat of arms image",
24
+ "P101": "Field of Work",
25
+ "P103": "Native language",
26
+ "P102": "Party",
27
+ "P108": "Employer",
28
+ "P109": "Signature",
29
+ "P119": "Place of burial",
30
+ "P135": "movement",
31
+ "P136": "genre",
32
+ "P138": "named after",
33
+ "P140": "Religion",
34
+ "P155": "follows",
35
+ "P156": "followed by",
36
+ "P157": "killed by",
37
+ "P166": "Award received",
38
+ "P172": "Ethnic group",
39
+ "P184": "Doctoral advisor",
40
+ "P241": "Military branch",
41
+ "P361": "part of",
42
+ "P373": "Commons category",
43
+ "P410": "Military rank",
44
+ "P412": "voice type",
45
+ "P413": "position on team",
46
+ "P425": "field of this profession",
47
+ "P428": "Botanist author",
48
+ "P443": "Pronunciation audio",
49
+ "P450": "Astronaut mission",
50
+ "P451": "Cohabitant",
51
+ "P463": "Member of",
52
+ "P485": "Archives at",
53
+ "P488": "Chairperson",
54
+ "P495": "Country of origin",
55
+ "P509": "Cause of death",
56
+ "P512": "Academic degree",
57
+ "P535": "Find a Grave",
58
+ "P551": "Residence",
59
+ "P552": "handedness",
60
+ "P555": "Tennis doubles record",
61
+ "P564": "Tennis singles record",
62
+ "P598": "Commander of",
63
+ "P607": "Conflicts",
64
+ "P641": "Sport",
65
+ "P647": "drafted by",
66
+ "P650": "RKDartists",
67
+ "P737": "influenced by",
68
+ "P741": "tennis playing hand",
69
+ "P793": "significant event",
70
+ "P800": "Notable work",
71
+ "P802": "student",
72
+ "P812": "Academic major",
73
+ "P866": "Perlentaucher ID",
74
+ "P898": "IPA",
75
+ "P900": "<deleted>",
76
+ "P910": "Main category",
77
+ "P935": "Commons gallery",
78
+ "P937": "Work location",
79
+ "P941": "Inspired by",
80
+ "P948": "Wikivoyage banner",
81
+ "P990": "voice recording",
82
+ "P1019": "feed URL",
83
+ "P1026": "doctoral thesis",
84
+ "P1038": "Relative",
85
+ "P1050": "Medical condition",
86
+ "P1066": "Student of",
87
+ "P1087": "Elo rating",
88
+ "P1151": "Wikimedia portal",
89
+ "P1185": "Rodovid ID",
90
+ "P1196": "Manner of death",
91
+ "P1220": "Broadway Database ID",
92
+ "P1233": "Speculative fiction DB",
93
+ "P1317": "floruit",
94
+ "P1303": "instrument played",
95
+ "P1321": "Place of Origin (Swiss)",
96
+ "P1343": "Described by source",
97
+ "P1344": "Participant in",
98
+ "P1399": "Convicted of",
99
+ "P1409": "Cycling Archives ID",
100
+ "P1411": "nominated for",
101
+ "P1412": "Languages",
102
+ "P1416": "affiliation",
103
+ "P1429": "pet",
104
+ "P1440": "Identifier FIDE",
105
+ "P1442": "Image of grave",
106
+ "P1447": "SportsReference ID",
107
+ "P1448": "Official name",
108
+ "P1449": "nickname",
109
+ "P1472": "Commons Creator page",
110
+ "P1477": "birth_name",
111
+ "P1532": "Country for sport",
112
+ "P1559": "Name in native language",
113
+ "P1563": "MacTutor id",
114
+ "P1576": "lifestyle",
115
+ "P1618": "sport number",
116
+ "P1665": "Chess Games ID",
117
+ "P1683": "quote",
118
+ "P1728": "AllMusic ID",
119
+ "P1801": "commemorative plaque",
120
+ "P1819": "genealogics ID",
121
+ "P1825": "Baseball-Reference.com ID",
122
+ "P1826": "Baseball-Reference.com ID",
123
+ "P1853": "blood type",
124
+ "P1889": "different from",
125
+ "P1967": "boxrec ID",
126
+ "P1971": "Number of children",
127
+ "P2020": "worldfootball.net",
128
+ "P2021": "Erdős number",
129
+ "P2042": "Artsy ID",
130
+ "P2048": "height",
131
+ "P2067": "mass",
132
+ "P2091": "FISA ID",
133
+ "P2174": "MOMA ID",
134
+ "P2193": "Soccerbase ID",
135
+ "P2276": "UEFA ID",
136
+ "P2350": "Speedskatingbase.eu ID",
137
+ "P2416": "sports discipline",
138
+ "P2423": "FIE fencer ID",
139
+ "P2446": "transfermarkt player ID",
140
+ "P2447": "transfermarkt manager ID",
141
+ "P2448": "Turkish Football Federation player ID",
142
+ "P2449": "Turkish Football Federation manager ID",
143
+ "P2458": "Mackolik ID",
144
+ "P2574": "NFT ID",
145
+ "P2593": "Latvian Olympic athlete ID",
146
+ "P2632": "place of detention",
147
+ "P2640": "SwimRankings ID",
148
+ "P2696": "FIG gymnast ID",
149
+ "P2727": "United World Wrestling ID",
150
+ "P2728": "CageMatch ID",
151
+ "P2767": "JudoInside ID",
152
+ "P2773": "FIS cross-country skier ID",
153
+ "P2779": "IAT weightlifter ID",
154
+ "P2796": "FIG gymnast ID",
155
+ "P2818": "Sherdog ID",
156
+ "P3065": "identifier__rero",
157
+ "P3172": "World Bridge Federation ID",
158
+ "P3373": "sibling"
159
+ },
160
+ "want": {
161
+ "P18": "image",
162
+ "P21": "gender",
163
+ "P97": "noble_title",
164
+ "P106": "occupation",
165
+ "P213": "identifier__ISNI",
166
+ "P214": "identifier__VIAF",
167
+ "P227": "identifier__GND",
168
+ "P244": "identifier__LCAuth",
169
+ "P245": "identifier__ULAN",
170
+ "P268": "identifier__BNF",
171
+ "P269": "identifier__SUDOC",
172
+ "P271": "identifier__CiNii",
173
+ "P345": "identifier__IMDB",
174
+ "P349": "identifier__NDL",
175
+ "P396": "identifier__SBN_it",
176
+ "P409": "identifier__NLA",
177
+ "P434": "identifier__MusicBrainz",
178
+ "P496": "identifier__ORCID",
179
+ "P511": "honorific_prefix",
180
+ "P513": "birth_name",
181
+ "P536": "identifier__ATP",
182
+ "P549": "identifier__MGP",
183
+ "P553": "website",
184
+ "P569": "birth_date",
185
+ "P570": "death_date",
186
+ "P599": "identifier__ITF",
187
+ "P640": "identifier__leonore",
188
+ "P646": "identifier__freebase",
189
+ "P648": "identifier__OLID",
190
+ "P651": "identifier__BPN",
191
+ "P691": "identifier__NKC",
192
+ "P723": "identifier__DBNL",
193
+ "P734": "family_name",
194
+ "P735": "given_name",
195
+ "P742": "pseudonym",
196
+ "P768": "electoral_district",
197
+ "P856": "website",
198
+ "P865": "identifier__BMLO",
199
+ "P902": "identifier__HDS",
200
+ "P906": "identifier__SELIBR",
201
+ "P947": "identifier__RSL",
202
+ "P949": "identifier__NLI",
203
+ "P950": "identifier__BNE",
204
+ "P951": "identifier__NSZL",
205
+ "P968": "email",
206
+ "P998": "identifier__dmoz",
207
+ "P1005": "identifier__PTBNP",
208
+ "P1006": "identifier__NTA",
209
+ "P1015": "identifier__BIBSYS",
210
+ "P1025": "identifier__SUDOC",
211
+ "P1017": "identifier__BAV",
212
+ "P1035": "honorific_suffix",
213
+ "P1045": "identifier__sycomore",
214
+ "P1047": "identifier__catholic_hierarchy",
215
+ "P1048": "identifier__NCL",
216
+ "P1146": "identifier__IIAF",
217
+ "P1157": "identifier__UScongress",
218
+ "P1186": "identifier__EuroparlMEP",
219
+ "P1207": "identifier__NUKAT",
220
+ "P1213": "identifier__NLC",
221
+ "P1214": "identifier__Riksdagen",
222
+ "P1229": "identifier__openpolis",
223
+ "P1258": "identifier__rotten_tomatoes",
224
+ "P1263": "identifier__NNDB",
225
+ "P1266": "identifier__AlloCine",
226
+ "P1273": "identifier__CANTIC",
227
+ "P1280": "identifier__CONOR",
228
+ "P1284": "identifier__Munzinger",
229
+ "P1285": "identifier__Munzinger",
230
+ "P1288": "identifier__Munzinger",
231
+ "P1289": "identifier__KLfG",
232
+ "P1291": "identifier__ADS",
233
+ "P1296": "identifier__GNC",
234
+ "P1307": "identifier__parlamentDOTch",
235
+ "P1309": "identifier__EGAXA",
236
+ "P1315": "identifier__NLAtrove",
237
+ "P1331": "identifier__PACE",
238
+ "P1341": "identifier__italian_cod",
239
+ "P1368": "identifier__LNB",
240
+ "P1375": "identifier__NSK",
241
+ "P1387": "political_alignment",
242
+ "P1415": "identifier__Oxforddnb",
243
+ "P1417": "identifier__Britannica",
244
+ "P1430": "identifier__OpenPlaques",
245
+ "P1438": "identifier__JewishEnc",
246
+ "P1469": "identifier__FIFA",
247
+ "P1556": "identifier__zbMATH",
248
+ "P1581": "weblog",
249
+ "P1607": "identifier__dialnet",
250
+ "P1615": "identifier__CLARA",
251
+ "P1617": "identifier__BBC_things",
252
+ "P1631": "identifier__ChinaVitae",
253
+ "P1648": "identifier__WelshBiography",
254
+ "P1649": "identifier__KMDb",
255
+ "P1650": "identifier__BBF",
256
+ "P1670": "identifier__LAC",
257
+ "P1695": "identifier__NLP",
258
+ "P1710": "identifier__saebi",
259
+ "P1711": "identifier__britishmuseum",
260
+ "P1713": "identifier__bundestag",
261
+ "P1714": "identifier__journalisted",
262
+ "P1741": "identifier__GTAA",
263
+ "P1749": "identifier__parlement",
264
+ "P1808": "identifier__senatDOTfr",
265
+ "P1814": "name__kana",
266
+ "P1816": "identifier__NPG",
267
+ "P1839": "identifier__FEC",
268
+ "P1871": "identifier__CERL",
269
+ "P1883": "identifier__declarator",
270
+ "P1890": "identifier__BNC",
271
+ "P1907": "identifier__AuDB",
272
+ "P1938": "identifier__Gutenberg",
273
+ "P1946": "identifier__N6I",
274
+ "P1969": "identifier__moviemeter",
275
+ "P1950": "second_surname",
276
+ "P1953": "identifier__discogs",
277
+ "P1959": "identifier__senate_nl",
278
+ "P1960": "identifier__google_scholar",
279
+ "P1980": "identifier__polsys",
280
+ "P1982": "identifier__anime_news",
281
+ "P1986": "identifier__treccani",
282
+ "P1996": "identifier__parliamentDOTuk",
283
+ "P2002": "twitter",
284
+ "P2003": "instagram",
285
+ "P2005": "identifier__halensis",
286
+ "P2013": "facebook",
287
+ "P2015": "identifier__hansard",
288
+ "P2018": "identifier__teuchos",
289
+ "P2019": "identifier__allmovie",
290
+ "P2029": "identifier__DoUB",
291
+ "P2035": "linkedin",
292
+ "P2163": "identifier__FAST",
293
+ "P2168": "identifier__SFDb",
294
+ "P2169": "identifier__publicwhip",
295
+ "P2170": "identifier__current_hansard",
296
+ "P2171": "identifier__TWFY",
297
+ "P2172": "identifier__parliamentary_record",
298
+ "P2173": "identifier__bbc_democracy_live",
299
+ "P2180": "identifier__kansallisbiografia",
300
+ "P2181": "identifier__eduskunta",
301
+ "P2182": "identifier__valtioneuvostosta",
302
+ "P2188": "identifier__BiblioNet",
303
+ "P2190": "identifier__CSPAN",
304
+ "P2255": "identifier__debretts_today",
305
+ "P2267": "identifier__politifact",
306
+ "P2273": "identifier__heidelberg",
307
+ "P2277": "identifier__magdeburg",
308
+ "P2278": "identifier__hellenic_parliament",
309
+ "P2280": "identifier__parlaments_at",
310
+ "P2342": "identifier__AGORHA",
311
+ "P2372": "identifier__ODIS",
312
+ "P2383": "identifier__CTHS",
313
+ "P2387": "identifier__elonet",
314
+ "P2390": "identifier__ballotpedia",
315
+ "P2397": "identifier__youtube",
316
+ "P2435": "identifier__PORT",
317
+ "P2487": "identifier__google_plus",
318
+ "P2492": "identifier__MTMT",
319
+ "P2519": "identifier__SCOPE",
320
+ "P2558": "identifier__autores_uy",
321
+ "P2562": "married_name",
322
+ "P2604": "identifier__kinopoisk",
323
+ "P2605": "identifier__CSFD",
324
+ "P2611": "identifier__TED",
325
+ "P2626": "identifier__DNF",
326
+ "P2639": "identifier__filmportal_de",
327
+ "P2671": "identifier__google_knowledge",
328
+ "P2686": "identifier__opensecrets",
329
+ "P2732": "identifier__persee",
330
+ "P2736": "identifier__FJC_bio",
331
+ "P2799": "identifier__BVMC",
332
+ "P2847": "identifier__google_plus",
333
+ "P2924": "identifier__bigenc_ru",
334
+ "P2949": "identifier__wikitree",
335
+ "P2953": "identifier__etis_ee",
336
+ "P2963": "identifier__goodreads",
337
+ "P3029": "identifier__national_archives",
338
+ "P3072": "identifier__storting",
339
+ "P3106": "identifier__guardian_topic",
340
+ "P3124": "identifier__polish_scientist",
341
+ "P3185": "identifier__vkontakte",
342
+ "P3188": "identifier__nobel_prize",
343
+ "P3221": "identifier__nyt_topic",
344
+ "P3222": "identifier__ne_se",
345
+ "P3258": "identifier__livejournal",
346
+ "P3267": "identifier__flickr",
347
+ "P3297": "identifier__flemish_parliament",
348
+ "P3298": "identifier__belgian_senate",
349
+ "P3344": "identifier__votesmart",
350
+ "P3348": "identifier__nlg",
351
+ "P3391": "identifier__verkhovna_rada",
352
+ "P3417": "identifier__quora",
353
+ "P3430": "identifier__snac",
354
+ "P3479": "identifier__omni"
355
+ }
356
+ }