reality 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +77 -0
- data/bin/reality +7 -0
- data/data/countries.yaml +254 -0
- data/data/country_lists.txt +445 -0
- data/data/country_orgs.yaml +72 -0
- data/data/infoboxes.txt +8804 -0
- data/data/infoboxes_freq.txt +3201 -0
- data/data/infoboxes_freq_sorted.txt +3117 -0
- data/examples/all_countries.rb +16 -0
- data/lib/reality.rb +15 -0
- data/lib/reality/country.rb +283 -0
- data/lib/reality/infoboxer_templates.rb +11 -0
- data/lib/reality/measure.rb +92 -0
- data/lib/reality/measure/unit.rb +120 -0
- data/reality.gemspec +37 -0
- data/script/extract_all_infoboxes.rb +16 -0
- data/script/extract_countries.rb +12 -0
- data/script/extract_country_categories.rb +16 -0
- data/script/extract_infobox_frequency.rb +27 -0
- data/script/lib/faraday_naive_cache.rb +40 -0
- data/script/out/categories.txt +619 -0
- metadata +138 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Example: fetch every country, sort by name and dump the resulting hashes
# into examples/output/countries.yaml, then report how long it took.
require 'rubygems'
require 'bundler/setup'
$:.unshift 'lib'
require 'reality'
require 'fileutils'
require 'yaml'

FileUtils.mkdir_p 'examples/output'

started_at = Time.now

sorted_countries = Reality.countries.to_a.sort_by(&:name)
File.write 'examples/output/countries.yaml', sorted_countries.map(&:to_h).to_yaml

elapsed = Time.now - started_at
puts "Finished in %i seconds" % elapsed
|
data/lib/reality.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'yaml'
require 'infoboxer'
|
2
|
+
|
3
|
+
# Library entry point: pulls in reality/infoboxer_templates first, then the
# basic building blocks, then the entities built on top of them.
module Reality
  require_relative 'reality/infoboxer_templates'

  # basic functionality
  require_relative 'reality/measure'

  # entities
  require_relative 'reality/country'
end
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# NB: all of this is an early draft, so it may look naive and sub-optimal.
|
2
|
+
# Just stay tuned!
|
3
|
+
|
4
|
+
module Reality
|
5
|
+
class Country
|
6
|
+
# A collection of countries identified by their Wikipedia page names.
# Pages are fetched through Reality.wp only when items are actually read,
# so building, counting or filtering a list performs no page fetch itself.
class List
  include Enumerable

  # @param names [Array<String>] page titles of the listed countries
  def initialize(*names)
    @names = names
  end

  # Without arguments: the number of names, answered without fetching.
  # With an argument or a block it behaves like Enumerable#count.
  def count(*args, &block)
    return super unless args.empty? && block.nil?

    @names.count
  end

  # First country, or the first +n+ countries when +n+ is given.
  # Only the requested pages are fetched.
  def first(n = nil)
    pick(:first, n)
  end

  # Last country, or the last +n+ countries when +n+ is given.
  def last(n = nil)
    pick(:last, n)
  end

  # Random country, or +n+ random countries when +n+ is given.
  def sample(n = nil)
    pick(:sample, n)
  end

  # Fetches all pages and yields each Country; returns an Enumerator
  # when called without a block.
  def each(&block)
    return to_enum(:each) unless block

    get(*@names).each(&block)
  end

  # Fetches all pages and returns the countries.
  def to_a
    get(*@names)
  end

  # Returns a new list narrowed by +filters+. Only +:continent+ is
  # understood at the moment; other keys are ignored. When +:continent+
  # is not given the list is returned unfiltered (it used to come back
  # empty, because every continent was compared against nil).
  def where(**filters)
    names = @names

    if filters.key?(:continent)
      on_continent = Reality::Country.
        by_continents.
        select { |_name, continent| continent == filters[:continent] }.
        map(&:first)
      names &= on_continent
    end

    self.class.new(*names)
  end

  private

  # Shared body of first/last/sample: selects names with the Array method
  # +selector+, fetches them, and unwraps the result when no +n+ was given.
  def pick(selector, n)
    countries = get(*@names.public_send(selector, n || 1))
    n ? countries : countries.first
  end

  # Fetches the pages for +names+ and wraps each one into a Country.
  def get(*names)
    return [] if names.empty? # nothing to ask Wikipedia for

    Reality.wp.get(*names).map { |page| Country.new(page) }
  end
end
|
56
|
+
|
57
|
+
# @param page the fetched Wikipedia page this country is read from
def initialize(page)
  @page = page
end

# Country name, taken from the title of the underlying page.
def name
  page.title
end
|
64
|
+
|
65
|
+
# Stripped text of the infobox's "conventional_long_name" variable.
def long_name
  variable = infobox.fetch('conventional_long_name')
  variable.text.strip
end

# First wikilink found inside the infobox's "capital" variable.
def capital
  capital_links = infobox.fetch('capital').lookup(:Wikilink)
  capital_links.first
end
|
72
|
+
|
73
|
+
# Languages grouped by kind: 'Official' plus one more group labelled by the
# infobox's "languages_type" (with a trailing " language(s)" removed).
# Groups with an empty label or no links are left out.
def languages
  official = infobox_links('official_languages')
  other_label = infobox.fetch('languages_type').text.sub(/ languages?$/, '')
  other = infobox_links('languages')

  groups = [['Official', official], [other_label, other]]
  groups.reject { |label, links| label.empty? || links.empty? }.to_h
end
|
79
|
+
|
80
|
+
# All top-level domains: link targets of the wikilinks in the "cctld" variable.
def tlds
  infobox_links('cctld').map { |wikilink| wikilink.link }
end

# The first of #tlds.
def tld
  tlds.first
end
|
87
|
+
|
88
|
+
# Stripped text of the infobox's "calling_code" variable.
def calling_code
  raw = infobox.fetch('calling_code').text
  raw.strip
end

# Integer read from the "utc_offset" variable; the typographic minus sign
# is replaced by an ASCII hyphen first so that String#to_i sees the sign.
def utc_offset
  raw = infobox.fetch('utc_offset').text
  raw.sub('−', '-').to_i
end
|
95
|
+
|
96
|
+
# Wikilinks from the "currency" variable, without the link to 'ISO 4217'.
def currencies
  infobox_links('currency').reject { |wikilink| wikilink.link == 'ISO 4217' }
end

# The first of #currencies.
def currency
  currencies.first
end
|
103
|
+
|
104
|
+
# Area in km² read from the infobox's "area_km2" variable (thousands
# separators removed).
#
# Returns nil when the variable is empty, consistent with #population and
# the GDP readers, instead of reporting a made-up area of 0 km².
def area
  raw = infobox.fetch('area_km2').text.gsub(',', '').strip
  return nil if raw.empty?

  Reality::Measure(raw.to_i, 'km²')
end
|
107
|
+
|
108
|
+
# Population as a Measure in 'person' units. The "population_estimate"
# variable is preferred, "population_census" is the fallback; nil when
# both are empty.
def population
  candidates = %w[population_estimate population_census].map { |variable|
    infobox.fetch(variable).text.strip
  }
  raw = candidates.find { |text| !text.empty? }

  raw && Reality::Measure(parse_maybe_scaled(raw), 'person')
end
|
114
|
+
|
115
|
+
# GDP (PPP) as a Measure in '$', or nil when the infobox has no value.
def gdp_ppp
  money_from_infobox('GDP_PPP')
end

# Nominal GDP as a Measure in '$', or nil when the infobox has no value.
def gdp_nominal
  money_from_infobox('GDP_nominal')
end

alias_method :gdp, :gdp_nominal

# Reads +varname+ from the infobox, drops a leading currency marker
# ($, Int$, US$ or USD) and parses the rest as a scaled number.
def money_from_infobox(varname)
  raw = infobox.fetch(varname).text.strip.sub(/^((Int|US)?\$|USD)/, '')
  return nil if raw.empty?

  Reality::Measure(parse_scaled(raw), '$')
end
private :money_from_infobox
|
126
|
+
|
127
|
+
# Hash of leader title => first wikilink of the matching "leader_nameN"
# variable; titles and names are paired by position.
def leaders
  title_texts = infobox.fetch(/^leader_title\d/).map(&:text_)
  people = infobox.fetch(/^leader_name\d/).map { |variable|
    variable.lookup(:Wikilink).first
  }
  title_texts.zip(people).to_h
end
|
132
|
+
|
133
|
+
# Continent looked up by page title in Country.by_continents.
def continent
  self.class.by_continents[page.title]
end

# Names of the known organizations this country belongs to.
def organizations
  organizations_list.map { |organization| organization[:name] }
end

# True when +org+ equals the name or the abbreviation of one of the
# organizations this country belongs to.
def member_of?(org)
  organizations_list.any? do |organization|
    organization[:name] == org || organization[:abbr] == org
  end
end
|
144
|
+
|
145
|
+
# A country prints as its name.
def to_s
  name
end

# Short debugging form: class name with the country name in parentheses.
def inspect
  "#<#{self.class}(#{name})>"
end
|
152
|
+
|
153
|
+
# Names of the reader methods exported by #to_h, in output order.
# Frozen so the shared list cannot be modified by accident.
PROPERTIES = %i[
  continent name long_name
  tld tlds calling_code utc_offset
  capital languages currency
  leaders area population
  gdp_ppp gdp_nominal
].freeze

# Hash of property name => value reduced with to_simple_type, with one
# entry for every name in PROPERTIES (nil and empty values are kept).
def to_h
  PROPERTIES.map { |prop| [prop, to_simple_type(send(prop))] }.to_h
end
|
168
|
+
|
169
|
+
class << self
  # Hash of country page title => continent name, memoized after the
  # first call. Built from the 'List of countries by continent' page:
  # for each subsection of its first section, the heading text is the
  # continent and the bold wikilinks of the first table are the countries.
  def by_continents
    @by_continents ||= Reality.wp.
      get('List of countries by continent').
      sections.first.
      sections.map{|s|
        continent = s.heading.text_
        s.tables.first.
          lookup(:Wikilink, :bold?).map(&:link).
          map{|country| [country, continent]}
      }.flatten(1).
      to_h
  end

  # Organization records loaded once from data/country_orgs.yaml.
  # Records are read with Symbol keys (:name, :abbr, :category) elsewhere
  # in this class.
  def organizations
    @organizations ||= begin
      source = File.read(File.expand_path('../../../data/country_orgs.yaml', __FILE__))
      # The file ships with the gem, so it is trusted. Since Psych 4
      # YAML.load is safe by default and rejects Symbols, hence
      # unsafe_load where it exists; older Psych only has the permissive load.
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(source) : YAML.load(source)
    end
  end
end
|
187
|
+
|
188
|
+
private
|
189
|
+
|
190
|
+
# The page passed to the constructor.
attr_reader :page

# Shortcut to the infobox of the underlying page.
def infobox
  page.infobox
end
|
195
|
+
|
196
|
+
# Organization records whose :category is among the names of this page's
# categories.
def organizations_list
  category_names = page.categories.map(&:name)
  self.class.organizations.select do |organization|
    category_names.include?(organization[:category])
  end
end
|
200
|
+
|
201
|
+
# Unique wikilinks found in the infobox variable +varname+. When the
# value contains a template whose name ends in "list", that template's
# variables are searched together with the value itself.
def infobox_links(varname)
  value = infobox.fetch(varname)
  list_template = value.lookup(:Template, name: /list$/).first

  # values could be both inside and outside list, see India's cctld value
  value = Infoboxer::Tree::Nodes[value, list_template.variables] if list_template

  value.lookup(:Wikilink).uniq
end
|
209
|
+
|
210
|
+
# See "Short scale": https://en.wikipedia.org/wiki/Long_and_short_scales#Comparison
SCALES = {
  'million' => 1_000_000,
  'billion' => 1_000_000_000,
  'trillion' => 1_000_000_000_000,
  'quadrillion' => 1_000_000_000_000_000,
  'quintillion' => 1_000_000_000_000_000_000,
  'sextillion' => 1_000_000_000_000_000_000_000,
  'septillion' => 1_000_000_000_000_000_000_000_000,
}.freeze
SCALES_REGEXP = Regexp.union(*SCALES.keys)

# Leading number (digits, dots, commas), optional whitespace, optional scale word.
SCALED_NUMBER_REGEXP = /^([0-9.,]+)[[:space:]]*(#{SCALES_REGEXP})?/

# Parses a number that must be followed by a scale word, e.g.
# "1.5 million" => 1_500_000. Commas are treated as thousands separators.
#
# Raises ArgumentError when +str+ does not start with "<number> <scale>".
def parse_scaled(str)
  parse_number(str, true)
end

# Same as parse_scaled, but the scale word is optional: "1,234" => 1234.
#
# Raises ArgumentError when +str+ does not start with a number.
def parse_maybe_scaled(str)
  parse_number(str, false)
end

# Common implementation of parse_scaled / parse_maybe_scaled.
def parse_number(str, scale_required)
  _, amount, scale = *SCALED_NUMBER_REGEXP.match(str)
  if amount.nil? || (scale_required && scale.nil?)
    fail(ArgumentError, "Unparseable scaled value #{str} for #{self}")
  end

  digits = amount.delete(',')
  return digits.to_i unless scale

  # Multiply as an exact Rational: going through Float loses precision on
  # the larger scales (1.1 sextillion came out 131072 too big).
  digits = "0#{digits}" if digits.start_with?('.')
  (digits.to_r * fetch_scale(scale)).to_i
end

# Multiplier for the scale word +str+ starts with; raises when unknown.
def fetch_scale(str)
  _, res = SCALES.detect { |key, _multiplier| str.start_with?(key) }

  res or fail("Scale not found: #{str} for #{self}")
end
|
247
|
+
|
248
|
+
# Recursively reduces +val+ to plain data (nil, numbers, strings, symbols,
# arrays and hashes of those):
#
# * wikilinks become their link target, other Infoboxer nodes their text;
# * measures become a plain number: an Integer for whole amounts, a
#   Float otherwise. The amount itself is a Rational, which is not a
#   simple type and would be dumped to YAML as a tagged Ruby object.
#
# Raises ArgumentError for any other class.
def to_simple_type(val)
  case val
  when nil, Numeric, String, Symbol
    val
  when Array
    val.map { |item| to_simple_type(item) }
  when Hash
    val.map { |key, value| [to_simple_type(key), to_simple_type(value)] }.to_h
  when Infoboxer::Tree::Wikilink
    val.link
  when Infoboxer::Tree::Node
    val.text_
  when Reality::Measure
    amount = val.amount
    amount.denominator == 1 ? amount.to_i : amount.to_f
  else
    fail ArgumentError, "Non-coercible value #{val.class}"
  end
end
|
266
|
+
end
|
267
|
+
|
268
|
+
# Fetches the page +name+ and wraps it into a Country. Returns nil when
# the page cannot be fetched or has no 'Infobox country' template.
def Reality.country(name)
  page = wp.get(name)
  return nil unless page

  # FIXME: not very reliable, as some fictional countries, alliances
  # and country groups also have this infobox. Or maybe it is acceptable?..
  return nil if page.templates(name: 'Infobox country').empty?

  Country.new(page)
end

# List of the given countries; without arguments, of every country known
# to Country.by_continents, sorted by name.
def Reality.countries(*names)
  selected = names.empty? ? Country.by_continents.keys.sort : names
  Country::List.new(*selected)
end

# Memoized Infoboxer Wikipedia client.
def Reality.wp
  @wp ||= Infoboxer.wp # while Infoboxer recreates wp for each request
end
|
283
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Reality
|
2
|
+
# An amount (kept as a Rational) together with a Unit. Supports comparison
# and arithmetic between measures, and scaling by plain numbers.
class Measure
  %w[unit].each{|mod| require_relative "measure/#{mod}"}

  include Comparable

  attr_reader :amount, :unit

  # @param amount anything Kernel#Rational accepts
  # @param unit [String, Unit] unit or its textual form, e.g. 'km²'
  def initialize(amount, unit)
    @amount, @unit = Rational(amount), Unit.parse(unit)
  end

  # Compares amounts of measures with the same unit. Returns nil for
  # anything else, following the <=> convention: == then answers false
  # instead of raising, while <, > etc. still raise ArgumentError.
  def <=>(other)
    return nil unless compatible?(other)

    amount <=> other.amount
  end

  def -@
    self.class.new(-amount, unit)
  end

  # Sum of two measures of the same unit; ArgumentError otherwise.
  def +(other)
    check_compatibility!(other)

    self.class.new(amount + other.amount, unit)
  end

  def -(other)
    self + (-other)
  end

  # Scales by a number, or multiplies by another measure (units multiply too).
  def *(other)
    case other
    when Numeric
      self.class.new(amount * other, unit)
    when self.class
      self.class.new(amount * other.amount, unit * other.unit)
    else
      fail ArgumentError, "Can't multiply by #{other.class}"
    end
  end

  # Divides by a number or by another measure. When the units cancel out
  # completely, a plain number is returned instead of a Measure.
  def /(other)
    case other
    when Numeric
      self.class.new(amount / other, unit)
    when self.class
      un = unit / other.unit
      un.scalar? ?
        amount / other.amount :
        self.class.new(amount / other.amount, un)
    else
      fail ArgumentError, "Can't divide by #{other.class}"
    end
  end

  # Raises the measure to a positive integer power by repeated
  # multiplication. Other exponents raise ArgumentError; zero and negative
  # ones used to return the measure unchanged.
  def **(num)
    unless num.is_a?(Integer) && num > 0
      fail ArgumentError, "Can't raise to power #{num.inspect}: positive integer expected"
    end

    (num - 1).times.inject(self) { |res| res * self }
  end

  def to_s
    '%s%s' % [formatted_amount, unit]
  end

  def inspect
    "#<%s(%s %s)>" % [self.class, formatted_amount, unit]
  end

  private

  def formatted_amount
    # FIXME: really naive
    if amount.abs < 1
      amount.to_f.to_s
    else
      # see http://stackoverflow.com/a/6460145/3683228
      amount.to_i.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
    end
  end

  # True when +other+ is a measure expressed in the same unit.
  def compatible?(other)
    other.kind_of?(self.class) && other.unit == unit
  end

  def check_compatibility!(other)
    unless compatible?(other)
      fail ArgumentError, "#{self} incompatible with #{other}"
    end
  end
end
|
88
|
+
|
89
|
+
# Conversion-style shortcut: Reality::Measure(10, 'km') builds a Measure
# from the same arguments Measure.new takes.
def Reality.Measure(*arg)
  Measure.new(*arg)
end
|
92
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module Reality
|
4
|
+
class Measure
|
5
|
+
# A unit of measurement stored as [symbol, power] pairs: "km/h" is
# [["km", 1], ["h", -1]]. Pairs with the same symbol are merged and
# zero powers dropped, so a fully cancelled unit has no components.
class Unit
  @unicode = true

  UNICODE_SUPER = {2 => '²', 3 => '³'}.freeze

  class << self
    # When true (the default) #to_s renders "·", "²" and "³"; otherwise
    # the ASCII forms "*" and "^N" are used.
    attr_accessor :unicode

    UNIT_REGEX = /[a-zA-Z\$]+/ # FIXME: there are many non-ASCII units, especially in money
    POWER_REGEX = /[²³]|\^(\d+)/
    OP_REGEX = /[\/*·]/

    # Parses strings like "km²", "m/s^2" or "kg·m/s". A Unit is returned
    # as is. Everything after the single allowed "/" goes to the
    # denominator. Raises RuntimeError on malformed input.
    def parse(str)
      return str if str.kind_of?(Unit)

      scanner = StringScanner.new(str)
      denom = false
      units = []

      loop do
        # (variable [power] operator) ....
        unit = scanner.scan(UNIT_REGEX) or fail("Variable expected at #{scanner.rest}")
        pow = scanner.scan(POWER_REGEX)
        units << [unit, parse_pow(pow, denom)]
        break if scanner.eos?

        op = scanner.scan(OP_REGEX) or fail("Operator expected at #{scanner.rest}")
        if op == '/'
          denom and fail("Second division at #{scanner.rest}")
          denom = true
        end
      end
      new(*units)
    end

    # Integer power for a scanned power token (nil means 1), negated
    # when the unit stands in the denominator.
    def parse_pow(p, denom)
      res = case p
        when nil then 1
        when '²' then 2
        when '³' then 3
        when /^\^(\d+)$/ then $1.to_i
        else fail(ArgumentError, "Can't parse power #{p}")
      end

      denom ? -res : res
    end
  end

  attr_reader :components

  # @param components [Array<Array(String, Integer)>] [symbol, power] pairs
  def initialize(*components)
    @components = components.
      group_by { |sig, _pow| sig }.
      map { |sig, cmps| [sig, cmps.map(&:last).inject(:+)] }.
      reject { |_sig, pow| pow.zero? }
  end

  # Units are equal when they have the same components, in any order
  # ("m·s" equals "s·m").
  def ==(other)
    other.class == self.class && other.components.sort == components.sort
  end
  alias_method :eql?, :==

  # Consistent with ==, so units can be used as Hash keys.
  def hash
    components.sort.hash
  end

  # True when every component has cancelled out.
  def scalar?
    components.empty?
  end

  # The inverse unit: every power negated.
  def -@
    self.class.new(*components.map { |sig, pow| [sig, -pow] })
  end

  def *(other)
    other.class == self.class or
      fail(TypeError, "Can't multiply #{self.class} by #{other.class}")

    self.class.new(*components, *other.components)
  end

  def /(other)
    other.class == self.class or
      fail(TypeError, "Can't divide #{self.class} by #{other.class}")

    self * -other
  end

  # Textual form: "numerator/denominator", "1/denominator" when nothing
  # is in the numerator, and an empty string for a scalar unit (which
  # used to render as "1/").
  def to_s
    return '' if scalar?

    num, denom = components.partition { |_sig, pow| pow > 0 }
    numerator = num.map { |sig, pow| "#{sig}#{power(pow)}" }.join(mul)
    denominator = denom.map { |sig, pow| "#{sig}#{power(pow)}" }.join(mul)
    case
    when numerator.empty?
      [1, denominator].join('/')
    when denominator.empty?
      numerator
    else
      [numerator, denominator].join('/')
    end
  end

  private

  def mul
    self.class.unicode ? '·' : '*'
  end

  # Suffix for the absolute value of a power: nothing for 1, a
  # superscript for 2 and 3 in unicode mode, "^N" otherwise.
  def power(num)
    num = num.abs
    case num
    when 0 then fail(ArgumentError, "0-power unit!")
    when 1 then ''
    when 2..3
      self.class.unicode ? UNICODE_SUPER.fetch(num) : "^#{num}"
    else "^#{num}"
    end
  end
end
|
119
|
+
end
|
120
|
+
end
|