reality 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +77 -0
- data/bin/reality +7 -0
- data/data/countries.yaml +254 -0
- data/data/country_lists.txt +445 -0
- data/data/country_orgs.yaml +72 -0
- data/data/infoboxes.txt +8804 -0
- data/data/infoboxes_freq.txt +3201 -0
- data/data/infoboxes_freq_sorted.txt +3117 -0
- data/examples/all_countries.rb +16 -0
- data/lib/reality.rb +15 -0
- data/lib/reality/country.rb +283 -0
- data/lib/reality/infoboxer_templates.rb +11 -0
- data/lib/reality/measure.rb +92 -0
- data/lib/reality/measure/unit.rb +120 -0
- data/reality.gemspec +37 -0
- data/script/extract_all_infoboxes.rb +16 -0
- data/script/extract_countries.rb +12 -0
- data/script/extract_country_categories.rb +16 -0
- data/script/extract_infobox_frequency.rb +27 -0
- data/script/lib/faraday_naive_cache.rb +40 -0
- data/script/out/categories.txt +619 -0
- metadata +138 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Example: fetch every known country, dump their properties to
# examples/output/countries.yaml and report how long it took.
require 'rubygems'
require 'bundler/setup'
$LOAD_PATH.unshift('lib')
require 'reality'
require 'fileutils'
require 'yaml'

FileUtils.mkdir_p('examples/output')

started_at = Time.now

# Sort by name so the resulting file is stable between runs.
countries = Reality.countries.to_a.sort_by(&:name)
File.write('examples/output/countries.yaml', countries.map(&:to_h).to_yaml)

puts format("Finished in %i seconds", Time.now - started_at)
|
data/lib/reality.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'infoboxer'
|
2
|
+
|
3
|
+
# Top-level namespace of the gem; loading this file pulls in every component.
module Reality
  require_relative 'reality/infoboxer_templates'

  # basic functionality
  require_relative 'reality/measure'

  # entities
  require_relative 'reality/country'
end
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# NB: all of this is early drafts, so may look naive and sub-optimal.
# Just stay tuned!

require 'yaml'

module Reality
  # A country, backed by its Wikipedia page (as returned by Reality.wp).
  # Most readers pull a variable out of the page's infobox and convert it
  # to a plain Ruby value or a Reality::Measure.
  class Country
    # A list of countries identified by page names. Pages are only fetched
    # (through Reality.wp) when elements are actually requested.
    class List
      include Enumerable

      # @param names [Array<String>] page titles of the countries
      def initialize(*names)
        @names = names
      end

      # Number of names in the list; does not fetch anything.
      def count
        @names.count
      end

      # @param n [Integer, nil] how many countries to fetch
      # @return [Country, Array<Country>] one country when +n+ is omitted,
      #   an array otherwise
      def first(n = nil)
        res = get(*@names.first(n || 1))
        n ? res : res.first
      end

      # Same contract as #first, counted from the end of the list.
      def last(n = nil)
        res = get(*@names.last(n || 1))
        n ? res : res.first
      end

      # Same contract as #first, for randomly picked names.
      def sample(n = nil)
        res = get(*@names.sample(n || 1))
        n ? res : res.first
      end

      # Fetches all countries of the list and iterates over them.
      def each(&block)
        to_a.each(&block)
      end

      # @return [Array<Country>] all countries of the list, freshly fetched
      def to_a
        get(*@names)
      end

      # Narrows the list down, keeping the order of the original names.
      #
      # Only the +:continent+ filter is implemented; other keys are
      # ignored. Without a +:continent+ key the list is returned
      # unfiltered (the draft used to return an empty list in that case).
      #
      # @param filters [Hash] e.g. <tt>continent: 'Europe'</tt>
      # @return [List] a new list
      def where(**filters)
        names = @names

        if filters.key?(:continent)
          matching = Reality::Country.by_continents.
            select { |_country, continent| continent == filters[:continent] }.
            keys
          names &= matching
        end

        self.class.new(*names)
      end

      private

      def get(*names)
        Reality.wp.get(*names).map { |page| Country.new(page) }
      end
    end

    # @param page the page object this country reads its data from
    def initialize(page)
      @page = page
    end

    def name
      page.title
    end

    def long_name
      infobox.fetch('conventional_long_name').text.strip
    end

    # @return first wikilink found in the infobox's +capital+ variable
    def capital
      infobox.fetch('capital').lookup(:Wikilink).first
    end

    # @return [Hash] language kind => links; kinds whose label or link
    #   list is empty are dropped
    def languages
      [
        ['Official', infobox_links('official_languages')],
        [infobox.fetch('languages_type').text.sub(/ languages?$/, ''), infobox_links('languages')]
      ].reject { |kind, links| kind.empty? || links.empty? }.to_h
    end

    def tld
      tlds.first
    end

    def tlds
      infobox_links('cctld').map(&:link)
    end

    def calling_code
      infobox.fetch('calling_code').text.strip
    end

    # @return [Integer] offset taken from the leading number of the
    #   +utc_offset+ variable (the typographic minus is normalized first)
    def utc_offset
      infobox.fetch('utc_offset').text.sub('−', '-').to_i
    end

    def currency
      currencies.first
    end

    # All currency links, minus the link to the ISO 4217 article itself.
    def currencies
      infobox_links('currency').reject { |l| l.link == 'ISO 4217' }
    end

    # @return [Reality::Measure] area in km²
    def area
      Reality::Measure(infobox.fetch('area_km2').text.gsub(',', '').to_i, 'km²')
    end

    # @return [Reality::Measure, nil] first non-empty of the estimate and
    #   census variables, in persons; nil when both are empty
    def population
      val = %w[population_estimate population_census].
        map { |var| infobox.fetch(var).text.strip }.
        find { |text| !text.empty? }

      val && Reality::Measure(parse_maybe_scaled(val), 'person')
    end

    # @return [Reality::Measure, nil] GDP (PPP) in $, nil when not stated
    def gdp_ppp
      money_variable('GDP_PPP')
    end

    # @return [Reality::Measure, nil] nominal GDP in $, nil when not stated
    def gdp_nominal
      money_variable('GDP_nominal')
    end

    alias_method :gdp, :gdp_nominal

    # @return [Hash] leader title => first wikilink of the matching
    #   +leader_nameN+ variable
    def leaders
      titles = infobox.fetch(/^leader_title\d/).map(&:text_)
      names = infobox.fetch(/^leader_name\d/).map { |v| v.lookup(:Wikilink).first }
      titles.zip(names).to_h
    end

    def continent
      self.class.by_continents[page.title]
    end

    # @return [Array] names of known organizations this country belongs to
    def organizations
      organizations_list.map { |o| o[:name] }
    end

    # @param org full name or abbreviation of an organization
    def member_of?(org)
      organizations_list.any? { |o| o[:name] == org || o[:abbr] == org }
    end

    def to_s
      name
    end

    def inspect
      "#<#{self.class}(#{name})>"
    end

    PROPERTIES = %i[
      continent name long_name
      tld tlds calling_code utc_offset
      capital languages currency
      leaders area population
      gdp_ppp gdp_nominal
    ]

    # @return [Hash] every property from PROPERTIES, converted to simple
    #   types (see #to_simple_type). nil and empty values are kept.
    def to_h
      PROPERTIES.
        map { |prop| [prop, to_simple_type(send(prop))] }.
        to_h
    end

    class << self
      # Mapping of country name => continent name, read once from the
      # 'List of countries by continent' page and memoized.
      def by_continents
        @by_continents ||= Reality.wp.
          get('List of countries by continent').
          sections.first.
          sections.map { |s|
            continent = s.heading.text_
            s.tables.first.
              lookup(:Wikilink, :bold?).map(&:link).
              map { |country| [country, continent] }
          }.flatten(1).
          to_h
      end

      # Organization descriptions loaded once from data/country_orgs.yaml.
      def organizations
        @organizations ||= begin
          source = File.read(File.expand_path('../../../data/country_orgs.yaml', __FILE__))
          # The file ships with the gem, so it is trusted. Entries are read
          # with Symbol keys (o[:name], o[:abbr], o[:category]), which
          # presumably are stored as symbols in the YAML; Psych 4 made
          # YAML.load safe-by-default and would reject them, hence the
          # explicit unsafe_load where it exists.
          YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(source) : YAML.load(source)
        end
      end
    end

    private

    attr_reader :page

    def infobox
      page.infobox
    end

    # Known organizations whose category is among this page's categories.
    def organizations_list
      catnames = page.categories.map(&:name)
      self.class.organizations.select { |o| catnames.include?(o[:category]) }
    end

    # Unique wikilinks found in an infobox variable.
    def infobox_links(varname)
      src = infobox.fetch(varname)
      if tmpl = src.lookup(:Template, name: /list$/).first
        # values could be both inside and outside list, see India's cctld value
        src = Infoboxer::Tree::Nodes[src, tmpl.variables]
      end
      src.lookup(:Wikilink).uniq
    end

    # Shared reader for the GDP-like variables: strips a leading currency
    # marker and parses the scaled number that follows.
    def money_variable(varname)
      val = infobox.fetch(varname).text.strip.sub(/^((Int|US)?\$|USD)/, '')
      val.empty? ? nil : Reality::Measure(parse_scaled(val), '$')
    end

    # See "Short scale": https://en.wikipedia.org/wiki/Long_and_short_scales#Comparison
    SCALES = {
      'million' => 1_000_000,
      'billion' => 1_000_000_000,
      'trillion' => 1_000_000_000_000,
      'quadrillion' => 1_000_000_000_000_000,
      'quintillion' => 1_000_000_000_000_000_000,
      'sextillion' => 1_000_000_000_000_000_000_000,
      'septillion' => 1_000_000_000_000_000_000_000_000,
    }
    SCALES_REGEXP = Regexp.union(*SCALES.keys)

    # Parses strings like "1.5 trillion" into an Integer.
    #
    # @raise [ArgumentError] when the string does not start with a number
    #   followed by a known scale word
    def parse_scaled(str)
      match, amount, scale = */^([0-9.,]+)[[:space:]]*(#{SCALES_REGEXP})/.match(str)
      match or
        fail(ArgumentError, "Unparseable scaled value #{str} for #{self}")

      (exact_amount(amount) * fetch_scale(scale)).to_i
    end

    # Like #parse_scaled, but the scale word is optional ("1,234,567").
    def parse_maybe_scaled(str)
      match, amount, scale = */^([0-9.,]+)[[:space:]]*(#{SCALES_REGEXP})?/.match(str)
      match or
        fail(ArgumentError, "Unparseable scaled value #{str} for #{self}")

      if scale
        (exact_amount(amount) * fetch_scale(scale)).to_i
      else
        amount.delete(',').to_i
      end
    end

    # Converts a decimal string ("1,234.5") to an exact Rational.
    # Going through Float here would make large scaled values inexact
    # (1.1 * 10**24 is not representable as a Float).
    # Thousands separators are dropped; anything after a second dot is
    # ignored.
    def exact_amount(digits)
      whole, fraction = digits.delete(',').split('.')
      fraction = fraction.to_s
      Rational("#{whole}#{fraction}".to_i, 10**fraction.size)
    end

    def fetch_scale(str)
      _, res = SCALES.find { |key, _val| str.start_with?(key) }

      res or fail("Scale not found: #{str} for #{self}")
    end

    # Recursively converts a property value to something serializable:
    # wikilinks become their targets, other nodes their text, measures
    # their amount.
    #
    # @raise [ArgumentError] for values of any other class
    def to_simple_type(val)
      case val
      when nil, Numeric, String, Symbol
        val
      when Array
        val.map { |v| to_simple_type(v) }
      when Hash
        val.map { |k, v| [to_simple_type(k), to_simple_type(v)] }.to_h
      when Infoboxer::Tree::Wikilink
        val.link
      when Infoboxer::Tree::Node
        val.text_
      when Reality::Measure
        val.amount
      else
        fail ArgumentError, "Non-coercible value #{val.class}"
      end
    end
  end

  # @return [Country, nil] nil when the page is missing or has no
  #   'Infobox country' template
  def Reality.country(name)
    page = wp.get(name) or return nil
    # FIXME: not very reliable, as some fictional countries, alliances
    # and country groups also have this infobox. Or maybe it is acceptable?..
    page.templates(name: 'Infobox country').empty? ? nil : Country.new(page)
  end

  # @param names [Array<String>] country names; all countries known to
  #   Country.by_continents (sorted) when omitted
  # @return [Country::List]
  def Reality.countries(*names)
    names = Country.by_continents.keys.sort if names.empty?
    Country::List.new(*names)
  end

  def Reality.wp
    @wp ||= Infoboxer.wp # while Infoboxer recreates wp for each request
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Reality
  # A numeric amount (stored as a Rational) paired with a Unit.
  # Supports comparison and basic arithmetic between measures.
  class Measure
    require_relative 'measure/unit'

    include Comparable

    attr_reader :amount, :unit

    # @param amount anything Kernel#Rational accepts
    # @param unit [String, Unit] parsed with Unit.parse
    def initialize(amount, unit)
      @amount, @unit = Rational(amount), Unit.parse(unit)
    end

    # Compares amounts of two measures having the same unit.
    #
    # @return [Integer, nil] nil when +other+ is not a comparable measure,
    #   so that == answers false and <, >, sort etc. raise ArgumentError
    #   through Comparable, instead of == itself blowing up.
    def <=>(other)
      return nil unless other.kind_of?(self.class) && other.unit == unit

      amount <=> other.amount
    end

    def -@
      self.class.new(-amount, unit)
    end

    # @raise [ArgumentError] unless +other+ is a measure of the same unit
    def +(other)
      check_compatibility!(other)

      self.class.new(amount + other.amount, unit)
    end

    def -(other)
      self + (-other)
    end

    # @param other [Numeric, Measure]
    # @raise [ArgumentError] for any other operand
    def *(other)
      case other
      when Numeric
        self.class.new(amount * other, unit)
      when self.class
        self.class.new(amount * other.amount, unit * other.unit)
      else
        fail ArgumentError, "Can't multiply by #{other.class}"
      end
    end

    # @param other [Numeric, Measure]
    # @return [Measure, Rational] a bare number when the units cancel out
    # @raise [ArgumentError] for any other operand
    def /(other)
      case other
      when Numeric
        self.class.new(amount / other, unit)
      when self.class
        un = unit / other.unit
        un.scalar? ?
          amount / other.amount :
          self.class.new(amount / other.amount, un)
      else
        fail ArgumentError, "Can't divide by #{other.class}"
      end
    end

    # Raises the measure to a positive integer power by repeated
    # multiplication.
    #
    # @raise [ArgumentError] for zero, negative or non-integer exponents
    #   (the draft silently returned +self+ for 0 and negatives)
    def **(num)
      unless num.kind_of?(Integer) && num > 0
        fail ArgumentError, "Can't raise #{self.class} to power #{num.inspect}"
      end

      (num - 1).times.inject(self) { |res, _| res * self }
    end

    def to_s
      '%s%s' % [formatted_amount, unit]
    end

    def inspect
      "#<%s(%s %s)>" % [self.class, formatted_amount, unit]
    end

    private

    def formatted_amount
      # FIXME: really naive
      if amount.abs < 1
        amount.to_f.to_s
      else
        # Integer part only, with thousands separators;
        # see http://stackoverflow.com/a/6460145/3683228
        amount.to_i.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
      end
    end

    def check_compatibility!(other)
      unless other.kind_of?(self.class) && other.unit == unit
        fail ArgumentError, "#{self} incompatible with #{other}"
      end
    end
  end

  # Shortcut constructor: Reality::Measure(10, 'km') == Measure.new(10, 'km')
  def Reality.Measure(*arg)
    Measure.new(*arg)
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module Reality
  class Measure
    # Unit of measurement: a list of [name, power] components, e.g.
    # "km/h" is [["km", 1], ["h", -1]]. Components with the same name are
    # merged and zero powers are dropped on construction.
    class Unit
      @unicode = true

      class << self
        # When truthy (default), #to_s uses '·', '²' and '³';
        # otherwise '*' and '^N'.
        attr_accessor :unicode

        UNIT_REGEX = /[a-zA-Z\$]+/ # FIXME: there are many non-ASCII units, especially in money
        POWER_REGEX = /[²³]|\^(\d+)/
        OP_REGEX = /[\/*·]/

        # Parses strings like "km²", "m/s^2" or "kg·m".
        # Everything after the (single allowed) '/' goes to the denominator.
        #
        # @param str [String, Unit] a Unit is returned as is
        # @return [Unit]
        # @raise [RuntimeError] on malformed input or a second '/'
        def parse(str)
          return str if str.kind_of?(Unit)

          scanner = StringScanner.new(str)
          denom = false
          units = []

          loop do
            # (variable [power] operator) ....
            unit = scanner.scan(UNIT_REGEX) || fail("Variable expected at #{scanner.rest}")
            pow = scanner.scan(POWER_REGEX)
            units << [unit, parse_pow(pow, denom)]
            break if scanner.eos?

            op = scanner.scan(OP_REGEX) || fail("Operator expected at #{scanner.rest}")
            if op == '/'
              fail("Second division at #{scanner.rest}") if denom
              denom = true
            end
          end

          new(*units)
        end

        # Converts a scanned power token (nil, '²', '³' or '^N') to an
        # Integer, negated for components of the denominator.
        #
        # @raise [ArgumentError] for any other token
        def parse_pow(p, denom)
          res = case p
                when nil then 1
                when '²' then 2
                when '³' then 3
                when /^\^(\d+)$/ then $1.to_i
                else fail(ArgumentError, "Can't parse power #{p}")
                end

          denom ? -res : res
        end
      end

      attr_reader :components

      # @param components [Array<Array(String, Integer)>] name/power pairs
      def initialize(*components)
        @components = components.
          group_by { |sig, _pow| sig }.
          map { |sig, cmps| [sig, cmps.map(&:last).inject(:+)] }.
          reject { |_sig, pow| pow.zero? }
      end

      # Units are equal when they have the same components, in any order
      # ("m·s" equals "s·m").
      def ==(other)
        other.class == self.class && other.components.sort == components.sort
      end

      alias_method :eql?, :==

      # Consistent with #==, so equal units are the same Hash key.
      def hash
        components.sort.hash
      end

      # True when there are no components left (e.g. after km / km).
      def scalar?
        components.empty?
      end

      # Reciprocal unit: every power negated (used by #/).
      def -@
        self.class.new(*components.map { |sig, pow| [sig, -pow] })
      end

      # @raise [TypeError] unless +other+ is a Unit
      def *(other)
        other.class == self.class or
          fail(TypeError, "Can't multiply #{self.class} by #{other.class}")

        self.class.new(*components, *other.components)
      end

      # @raise [TypeError] unless +other+ is a Unit
      def /(other)
        other.class == self.class or
          fail(TypeError, "Can't divide #{self.class} by #{other.class}")

        self * -other
      end

      # Renders "numerator/denominator"; a scalar unit renders as an empty
      # string (the draft produced the dangling "1/").
      def to_s
        return '' if scalar?

        num, denom = components.partition { |_sig, pow| pow > 0 }
        numerator = num.map { |sig, pow| "#{sig}#{power(pow)}" }.join(mul)
        denominator = denom.map { |sig, pow| "#{sig}#{power(pow)}" }.join(mul)

        if numerator.empty?
          [1, denominator].join('/')
        elsif denominator.empty?
          numerator
        else
          [numerator, denominator].join('/')
        end
      end

      private

      UNICODE_SUPER = {2 => '²', 3 => '³'}

      def mul
        self.class.unicode ? '·' : '*'
      end

      # Suffix for the absolute value of a power: nothing for 1,
      # a superscript or '^N' otherwise.
      def power(num)
        num = num.abs
        case num
        when 0 then fail(ArgumentError, "0-power unit!")
        when 1 then ''
        when 2..3
          self.class.unicode ? UNICODE_SUPER.fetch(num) : "^#{num}"
        else "^#{num}"
        end
      end
    end
  end
end
|