mode 0.0.5 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/README.md +17 -22
- data/bin/mode +1 -1
- data/lib/mode.rb +34 -6
- data/lib/mode/api/form.rb +53 -0
- data/lib/mode/api/link.rb +31 -0
- data/lib/mode/api/request.rb +181 -0
- data/lib/mode/api/resource.rb +67 -0
- data/lib/mode/auth/access_token.rb +23 -0
- data/lib/mode/cli.rb +3 -3
- data/lib/mode/cli/analyze.rb +1 -1
- data/lib/mode/cli/base.rb +5 -0
- data/lib/mode/cli/connect.rb +18 -0
- data/lib/mode/cli/helpers.rb +0 -9
- data/lib/mode/cli/import.rb +9 -38
- data/lib/mode/cli/login.rb +13 -0
- data/lib/mode/cli/package.rb +2 -5
- data/lib/mode/commands/analyze_field.rb +20 -21
- data/lib/mode/commands/analyze_schema.rb +69 -48
- data/lib/mode/commands/connect.rb +78 -0
- data/lib/mode/commands/helpers.rb +54 -0
- data/lib/mode/commands/import.rb +209 -20
- data/lib/mode/commands/login.rb +111 -0
- data/lib/mode/config.rb +13 -33
- data/lib/mode/configurable.rb +46 -0
- data/lib/mode/connector/config.rb +31 -0
- data/lib/mode/connector/daemon.rb +27 -0
- data/lib/mode/connector/data_source.rb +75 -0
- data/lib/mode/connector/dataset.rb +13 -0
- data/lib/mode/connector/message.rb +31 -0
- data/lib/mode/connector/poller.rb +27 -0
- data/lib/mode/connector/processor.rb +58 -0
- data/lib/mode/connector/registrar.rb +36 -0
- data/lib/mode/connector/scheduler.rb +62 -0
- data/lib/mode/connector/selector.rb +47 -0
- data/lib/mode/connector/type_map.rb +45 -0
- data/lib/mode/connector/uploader.rb +50 -0
- data/lib/mode/logger.rb +202 -0
- data/lib/mode/version.rb +1 -1
- data/mode.gemspec +13 -2
- data/spec/api/form_spec.rb +51 -0
- data/spec/api/link_spec.rb +23 -0
- data/spec/api/request_spec.rb +111 -0
- data/spec/api/resource_spec.rb +70 -0
- data/spec/auth/access_token_spec.rb +22 -0
- data/spec/commands/analyze_field_spec.rb +26 -0
- data/spec/commands/analyze_schema_spec.rb +7 -5
- data/spec/commands/connect_spec.rb +80 -0
- data/spec/commands/helpers_spec.rb +69 -0
- data/spec/commands/import_spec.rb +155 -0
- data/spec/commands/login_spec.rb +178 -0
- data/spec/config_spec.rb +9 -7
- data/spec/connector/config_spec.rb +46 -0
- data/spec/connector/daemon_spec.rb +30 -0
- data/spec/connector/data_source_spec.rb +73 -0
- data/spec/connector/message_spec.rb +22 -0
- data/spec/connector/poller_spec.rb +26 -0
- data/spec/connector/processor_spec.rb +93 -0
- data/spec/connector/registrar_spec.rb +53 -0
- data/spec/connector/scheduler_spec.rb +93 -0
- data/spec/connector/selector_spec.rb +54 -0
- data/spec/connector/type_map_spec.rb +45 -0
- data/spec/connector/uploader_spec.rb +55 -0
- data/spec/fixtures/country-codes/README.md +71 -0
- data/spec/fixtures/country-codes/data/country-codes.csv +250 -0
- data/spec/fixtures/country-codes/datapackage.json +142 -0
- data/spec/fixtures/country-codes/scripts/get_countries_of_earth.py +370 -0
- data/spec/fixtures/country-codes/scripts/reorder_columns.py +8 -0
- data/spec/fixtures/country-codes/scripts/requirements.pip +2 -0
- data/spec/fixtures/espn_draft.csv +473 -1
- data/spec/fixtures/espn_draft/data.csv +473 -0
- data/spec/fixtures/espn_draft/datapackage.json +43 -0
- data/spec/logger_spec.rb +79 -0
- data/spec/spec_helper.rb +6 -1
- metadata +156 -19
- data/lib/mode/cli/setup.rb +0 -12
- data/lib/mode/commands/package.rb +0 -56
- data/lib/mode/commands/setup.rb +0 -36
- data/lib/mode/package_builder.rb +0 -57
- data/spec/commands/setup_spec.rb +0 -62
- data/spec/fixtures/MOCK_DATA.csv +0 -100001
- data/spec/fixtures/cb_clean_small.csv +0 -100000
- data/spec/fixtures/duplicate_keys.csv +0 -3
- data/spec/fixtures/format_examples.csv.txt +0 -6
- data/spec/fixtures/format_examples_after_excel.csv.txt +0 -1
@@ -0,0 +1,142 @@
|
|
1
|
+
{
|
2
|
+
"name": "country-codes",
|
3
|
+
"title": "Comprehensive country codes: ISO 3166, ITU, ISO 4217 currency codes and many more",
|
4
|
+
"format": "csv",
|
5
|
+
"datapackage_version": "1.0-beta.3",
|
6
|
+
"licenses": [
|
7
|
+
{
|
8
|
+
"id": "odc-pddl",
|
9
|
+
"name": "Public Domain Dedication and License",
|
10
|
+
"version": "1.0",
|
11
|
+
"url": "http://opendatacommons.org/licenses/pddl/1.0/"
|
12
|
+
}
|
13
|
+
],
|
14
|
+
"sources": [
|
15
|
+
{
|
16
|
+
"name": "International Organization for Standardization",
|
17
|
+
"web": "http://www.iso.org/iso/country_codes/iso_3166_code_lists.htm"
|
18
|
+
},
|
19
|
+
{
|
20
|
+
"name": "SIX Interbank Clearing Ltd (on behalf of ISO)",
|
21
|
+
"web": "http://www.currency-iso.org/dam/downloads/dl_iso_table_a1.xls"
|
22
|
+
},
|
23
|
+
{
|
24
|
+
"name": "Statoids",
|
25
|
+
"web": "http://www.statoids.com/wab.html"
|
26
|
+
}
|
27
|
+
],
|
28
|
+
"resources": [
|
29
|
+
{
|
30
|
+
"url": "https://raw.github.com/datasets/country-codes/master/data/country-codes.csv",
|
31
|
+
"path": "data/country-codes.csv",
|
32
|
+
"schema": {
|
33
|
+
"fields": [
|
34
|
+
{
|
35
|
+
"name": "name",
|
36
|
+
"description": "Country's official English short name",
|
37
|
+
"type": "string"
|
38
|
+
},
|
39
|
+
{
|
40
|
+
"name": "name_fr",
|
41
|
+
"description": "Country's offical French short name",
|
42
|
+
"type": "string"
|
43
|
+
},
|
44
|
+
{
|
45
|
+
"name": "ISO3166-1-Alpha-2",
|
46
|
+
"description": "Alpha-2 codes from ISO 3166-1",
|
47
|
+
"type": "string"
|
48
|
+
},
|
49
|
+
{
|
50
|
+
"name": "ISO3166-1-Alpha-3",
|
51
|
+
"description": "Alpha-3 codes from ISO 3166-1 (synonymous with World Bank Codes)",
|
52
|
+
"type": "string"
|
53
|
+
},
|
54
|
+
{
|
55
|
+
"name": "ISO3166-1-numeric",
|
56
|
+
"description": "Numeric codes from ISO 3166-1 (synonymous with UN Statistics M49 Codes)",
|
57
|
+
"type": "integer"
|
58
|
+
},
|
59
|
+
{
|
60
|
+
"name": "ITU",
|
61
|
+
"description": "Codes assigned by the International Telecommunications Union",
|
62
|
+
"type": "string"
|
63
|
+
},
|
64
|
+
{
|
65
|
+
"name": "MARC",
|
66
|
+
"description": "MAchine-Readable Cataloging codes from the Library of Congress",
|
67
|
+
"type": "string"
|
68
|
+
},
|
69
|
+
{
|
70
|
+
"name": "WMO",
|
71
|
+
"description": "Country abbreviations by the World Meteorological Organization",
|
72
|
+
"type": "string"
|
73
|
+
},
|
74
|
+
{
|
75
|
+
"name": "DS",
|
76
|
+
"description": "Distinguishing signs of vehicles in international traffic",
|
77
|
+
"type": "string"
|
78
|
+
},
|
79
|
+
{
|
80
|
+
"name": "Dial",
|
81
|
+
"description": "Country code from ITU-T recommendation E.164, sometimes followed by area code",
|
82
|
+
"type": "string"
|
83
|
+
},
|
84
|
+
{
|
85
|
+
"name": "FIFA",
|
86
|
+
"description": "Codes assigned by the Fédération Internationale de Football Association",
|
87
|
+
"type": "string"
|
88
|
+
},
|
89
|
+
{
|
90
|
+
"name": "FIPS",
|
91
|
+
"description": "Codes from the U.S. standard FIPS PUB 10-4",
|
92
|
+
"type": "string"
|
93
|
+
},
|
94
|
+
{
|
95
|
+
"name": "GAUL",
|
96
|
+
"description": "Global Administrative Unit Layers from the Food and Agriculture Organization",
|
97
|
+
"type": "integer"
|
98
|
+
},
|
99
|
+
{
|
100
|
+
"name": "IOC",
|
101
|
+
"description": "Codes assigned by the International Olympics Committee",
|
102
|
+
"type": "string"
|
103
|
+
},
|
104
|
+
{
|
105
|
+
"name": "currency_alphabetic_code",
|
106
|
+
"description": "ISO 4217 currency alphabetic code",
|
107
|
+
"type": "string"
|
108
|
+
},
|
109
|
+
{
|
110
|
+
"name": "currency_country_name",
|
111
|
+
"description": "ISO 4217 country name",
|
112
|
+
"type": "string"
|
113
|
+
},
|
114
|
+
{
|
115
|
+
"name": "currency_minor_unit",
|
116
|
+
"description": "ISO 4217 currency number of minor units",
|
117
|
+
"type": "integer"
|
118
|
+
},
|
119
|
+
{
|
120
|
+
"name": "currency_name",
|
121
|
+
"description": "ISO 4217 currency name",
|
122
|
+
"type": "string"
|
123
|
+
},
|
124
|
+
{
|
125
|
+
"name": "currency_numeric_code",
|
126
|
+
"description": "ISO 4217 currency numeric code",
|
127
|
+
"type": "integer"
|
128
|
+
},
|
129
|
+
{
|
130
|
+
"name": "is_independent",
|
131
|
+
"description": "Country status, based on the CIA World Factbook",
|
132
|
+
"type": "string"
|
133
|
+
}
|
134
|
+
]
|
135
|
+
}
|
136
|
+
}
|
137
|
+
],
|
138
|
+
"maintainers":[{
|
139
|
+
"name": "Evan Wheeler",
|
140
|
+
"web": "https://github.com/datasets/country-codes"
|
141
|
+
}]
|
142
|
+
}
|
@@ -0,0 +1,370 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# vim: ai ts=4 sts=4 et sw=4
|
4
|
+
import codecs
|
5
|
+
import urllib
|
6
|
+
import argparse
|
7
|
+
import json
|
8
|
+
import operator
|
9
|
+
import collections
|
10
|
+
|
11
|
+
from lxml import html
|
12
|
+
from lxml import etree
|
13
|
+
|
14
|
+
# some ANSI colors, etc
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
ENDC = '\033[0m'


def print_info(string):
    """Print ``string`` to stdout wrapped in the GREEN / ENDC escape codes."""
    # A parenthesized single expression is valid both as a Python 2 print
    # statement and as a Python 3 print() call, with identical output.
    print(GREEN + string + ENDC)


def print_warn(string):
    """Print ``string`` to stdout wrapped in the YELLOW / ENDC escape codes."""
    print(YELLOW + string + ENDC)


def print_error(string):
    """Print ``string`` to stdout wrapped in the RED / ENDC escape codes."""
    print(RED + string + ENDC)
|
33
|
+
|
34
|
+
|
35
|
+
def process_statoids_row(tr):
    """Flatten one table row element into a list of cell values.

    Walks the children of ``tr`` and appends one value per cell; a cell
    that carries a ``colspan`` attribute contributes that many empty
    strings instead. Cells holding a ``<code>`` element, a ``<br>`` or a
    footnote anchor get the special handling described inline.

    :param tr: row element exposing the lxml element API used below
        (``iterchildren``, ``get``, ``getchildren``, ``find``,
        ``text_content``).
    :returns: list of cell values in document order.
    :raises AssertionError: if a ``colspan`` value is not numeric or a
        footnote anchor does not carry the expected marker text.
    """
    row = []
    for td in tr.iterchildren():
        colspan = td.get('colspan')
        if colspan is not None:
            # if a cell is taking up more than one column,
            # append the same number of blanks to the row
            assert colspan.isdigit()
            # range() yields the same sequence as xrange() and also
            # exists on Python 3
            for _ in range(int(colspan)):
                row.append('')
            continue

        children = td.getchildren()
        if len(children) == 1:
            if td.find('.//br') is not None and children[0].tag == 'br':
                # text split over two lines by a <br>: join both halves
                td.text = td.text + " " + children[0].tail
                row.append(td.text)
                continue

            code = td.find("code")
            if code is not None:
                # some cells contain more than one code,
                # so append a list also containing the code
                # that appears after the child element (<br>)
                if len(code.getchildren()) > 0:
                    line_break = td.find('.//br')
                    if line_break is not None:
                        row.append(code.text + ',' + line_break.tail)
                        continue
                    anchor = td.find('.//a')
                    # the alpha-2 code is the second value of the row; guard
                    # the lookup the same way the non-<code> branch does
                    if anchor is not None and len(row) > 1:
                        # UK has 4 FIFA codes
                        if row[1] == "GB":
                            assert anchor.text == "1"
                            row.append("ENG,NIR,SCO,WAL")
                            continue
                        # MARC treats United States Minor Outlying Islands
                        # as five countries
                        if row[1] == "UM":
                            assert anchor.text == "b"
                            row.append("ji,xf,wk,uc,up")
                            continue
                # some cells contain anchor to footnote,
                # so append only the content of the code element
                row.append(code.text)
                continue
            else:
                anchor = td.find('.//a')
                if anchor is not None:
                    # FIPS treats United States Minor Outlying Islands
                    # as nine countries
                    if len(row) > 1 and row[1] == "UM":
                        assert anchor.text == "a"
                        row.append("FQ,HQ,DQ,JQ,KQ,MQ,BQ,LQ,WQ")
                        continue
        # every other cell: plain text content
        row.append(td.text_content())
    return row
|
89
|
+
|
90
|
+
def clean_line(line):
    """Split one ``name;alpha2`` line into its two parts.

    :param line: a raw line, either UTF-8 encoded bytes or already
        decoded text.
    :returns: ``(name, alpha2)`` split at the first ``;`` with trailing
        whitespace removed, or ``(None, None)`` when the line holds no
        ``;`` or cannot be decoded as UTF-8.
    """
    if isinstance(line, bytes):
        try:
            line = line.decode('utf8')
        except UnicodeDecodeError:
            print_warn('Unable to decode country name: %s' % line)
            # callers unpack the result into two names, so always hand
            # back a pair instead of falling through to None
            return (None, None)
    line = line.rstrip()
    name, separator, alpha2 = line.partition(';')
    if not separator:
        return (None, None)
    return (name, alpha2)
|
102
|
+
|
103
|
+
def capitalize_country_name(name):
    """Turn an all-caps country name into a conventionally cased one.

    The name is split on whitespace and every word is re-cased on its own:
    connecting words listed in ``always_lower`` become lower-case, words
    containing a ``.`` stay upper-case, hyphenated parts are capitalized
    individually, a single-letter prefix before an apostrophe is lowered
    and parentheses are preserved around the re-cased word.

    :param name: upper-case country name.
    :returns: the re-cased words joined by single spaces.
    """
    always_lower = ['AND', 'THE', 'OF', 'PART', 'DA', 'DE', 'ET', 'DU', 'DES',
                    'LA']

    def recase(word):
        # connecting words stay lower-case, everything else is capitalized
        if word in always_lower:
            return word.lower()
        return word.capitalize()

    cap_list = []
    for w in name.split():
        if w == 'MCDONALD':
            cap_list.append('McDonald')
            # stop here, otherwise the generic branch below appends the
            # word a second time as "Mcdonald"
            continue
        if w.find('.') > 0:
            # abbreviations such as U.S. stay upper-case
            cap_list.append(w.upper())
            continue
        if w.find('\'') > 0:
            # d'Ivoire instead of D'ivoire
            s = w.split('\'')
            if len(s[0]) == 1:
                cap_list.append(s[0].lower() + '\'' + s[1].capitalize())
                continue
        if w.find('-') > 0:
            # Timor-Leste instead of Timor-leste
            cap_list.append('-'.join([part.capitalize() for part in w.split('-')]))
            continue

        if w.startswith('('):
            cap_list.append('(' + recase(w.replace('(', '')))
            continue

        if w[-1] == ')':
            cap_list.append(recase(w.replace(')', '')) + ')')
            continue

        cap_list.append(recase(w))

    return " ".join(cap_list)
|
150
|
+
|
151
|
+
|
152
|
+
def get_currency_data(country_info, en_names):
    """Download the ISO 4217 currency table and merge it into ``country_info``.

    The XML document at ``currency_url`` is fetched and parsed; every
    second-level element is turned into a dict of currency properties
    and merged into the ``country_info`` entry found by looking up the
    element's country name in ``en_names``.

    :param country_info: dict of per-country dicts, indexed by the
        values stored in ``en_names``; updated in place.
    :param en_names: dict mapping country name to that index value.
    :returns: the same ``country_info`` object.
    """
    # fetch iso currency codes
    currency_url = "http://www.currency-iso.org/dam/downloads/table_a1.xml"
    print_info('Fetching currency codes...')
    currencies_xml_str = urllib.urlopen(currency_url).read()
    currencies = etree.fromstring(currencies_xml_str)

    # map source's tag names to our property names
    currency_tag_map = {
        u"CtryNm": u"currency_country_name",
        u"CcyNm": u"currency_name",
        u"Ccy": u"currency_alphabetic_code",
        u"CcyNbr": u"currency_numeric_code",
        u"CcyMnrUnts": u"currency_minor_unit",
        u"AddtlInf": u"currency_additional_info"
    }
    # reconcile country names, add entries for non-country-based currencies
    currency_country_name_map = {
        u"MACEDONIA, THE FORMER \nYUGOSLAV REPUBLIC OF": "MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF",
        u"SAINT HELENA, ASCENSION AND \nTRISTAN DA CUNHA": "SAINT HELENA, ASCENSION AND TRISTAN DA CUNHA",
        u"CONGO, THE DEMOCRATIC REPUBLIC OF": "CONGO, THE DEMOCRATIC REPUBLIC OF THE",
        u"HEARD ISLAND AND McDONALD ISLANDS": "HEARD ISLAND AND MCDONALD ISLANDS",
        u"KOREA, DEMOCRATIC PEOPLE’S REPUBLIC OF": "KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF",
        u"LAO PEOPLE’S DEMOCRATIC REPUBLIC": "LAO PEOPLE'S DEMOCRATIC REPUBLIC",
        u"SERBIA ": "SERBIA",
        u"PALESTINIAN TERRITORY, OCCUPIED": "PALESTINE, STATE OF",
        u"Vatican City State (HOLY SEE)": "HOLY SEE (VATICAN CITY STATE)",
        u"VIRGIN ISLANDS (BRITISH)": "VIRGIN ISLANDS, BRITISH",
        u"VIRGIN ISLANDS (US)": "VIRGIN ISLANDS, U.S.",
        u"MEMBER COUNTRIES OF THE AFRICAN DEVELOPMENT BANK GROUP": None,
        u"INTERNATIONAL MONETARY FUND (IMF)": None,
        u"SISTEMA UNITARIO DE COMPENSACION REGIONAL DE PAGOS \"SUCRE\"": None,
        u"EUROPEAN UNION": None,
        u"ZZ01_Bond Markets Unit European_EURCO": None,
        u"ZZ02_Bond Markets Unit European_EMU-6": None,
        u"ZZ03_Bond Markets Unit European_EUA-9": None,
        u"ZZ04_Bond Markets Unit European_EUA-17": None,
        u"ZZ05_UIC-Franc": None,
        u"ZZ06_Testing_Code": None,
        u"ZZ07_No_Currency": None,
        u"ZZ08_Gold": None,
        u"ZZ09_Palladium": None,
        u"ZZ10_Platinum": None,
        u"ZZ11_Silver": None,
    }

    def process_element(country):
        """Merge the currency properties of one element into country_info."""
        currency_dict = {}
        for currency_tag in country.iterchildren():
            prop = currency_tag_map.get(currency_tag.tag)
            if prop is None:
                # tag we have no property name for: skip it instead of
                # failing the whole run with a KeyError
                continue
            # ignore newly added additional info field
            if prop == "currency_additional_info":
                break
            # skip 'same day', 'next day', etc variations
            elif (prop == "currency_name") and (len(currency_tag.items()) > 0):
                if currency_tag.items()[0][0] == 'IsFund':
                    break
            else:
                currency_dict.update({prop: currency_tag.text})

        currency_alpha2 = None
        currency_name = currency_dict.get('currency_country_name')
        # test for a missing name *before* calling string methods on it
        if currency_name is not None:
            # remove random line breaks, etc
            currency_name = currency_name.replace(u'\xa0', u'').replace(u'\n', u'').replace(u'\r', u'')
            # replace name with line breaks, etc removed
            currency_dict['currency_country_name'] = currency_name
            try:
                currency_alpha2 = en_names[currency_name]
            except KeyError:
                currency_alpha2 = en_names.get(
                    currency_country_name_map.get(currency_name))

        if currency_alpha2:
            if currency_alpha2 in country_info:
                country_info[currency_alpha2].update(currency_dict)
            else:
                # name resolved, but there is no entry to merge into
                print_warn('Failed to match currency data for country: "%s"'
                           % currency_name)
        else:
            if currency_name not in currency_country_name_map:
                print_warn('Failed to match currency data for country: "%s"'
                           % currency_name)
        return

    for iso_currency_table in currencies.iterchildren():
        for country in iso_currency_table.iterchildren():
            process_element(country)

    return country_info
|
236
|
+
|
237
|
+
def fetch_and_write(options):
    """Fetch country names and codes, combine them and write them as JSON.

    Downloads the English and French ISO name lists and the statoids.com
    code table, merges them per alpha-2 code, adds currency data via
    ``get_currency_data`` and dumps the result to a JSON file.

    :param options: object with the attributes ``key`` (column used to
        index the combined data), ``as_list`` (export a list sorted by
        ``name`` instead of a dict) and ``outfile`` (output path; a
        falsy value selects ``data/country-codes.json``).
    """
    # fetch ISO short names in English and French
    print_info('Fetching English country names and codes...')
    iso_names_en = urllib.urlretrieve('http://www.iso.org/iso/list-en1-semic-3.txt')
    print_info('Fetching French country names and codes...')
    iso_names_fr = urllib.urlretrieve('http://www.iso.org/iso/list-fr1-semic.txt')

    # dict for combining en and fr names
    # {alpha2: {'name': en, 'name_fr': fr}}
    iso_names = {}

    # dict for looking up alpha2 from name
    en_names = {}

    # urllib.urlretrieve returns a tuple of (localfile, headers)
    with open(iso_names_en[0], "rU") as fin:
        for line in fin:
            name, alpha2 = clean_line(line)
            if name and alpha2:
                iso_names.update({alpha2: {'name': name}})
                en_names.update({name: alpha2})

    with open(iso_names_fr[0], "rU") as fin:
        for line in fin:
            name, alpha2 = clean_line(line)
            if name and alpha2:
                # alpha2 should already be present because english was
                # parsed first; if it is not, start an entry with a blank
                # where the english name should be
                names = iso_names.setdefault(alpha2, {'name': ''})
                names['name_fr'] = name

    # fetch content of statoids.com country code page
    statoids_url = "http://www.statoids.com/wab.html"
    print_info('Fetching other country codes...')
    content = urllib.urlopen(statoids_url).read()
    doc = html.fromstring(content)

    # i dislike some of statoid's column names, so here i have renamed
    # a few to be more descriptive
    column_names = ["Entity", "ISO3166-1-Alpha-2", "ISO3166-1-Alpha-3",
                    "ISO3166-1-numeric", "ITU", "FIPS", "IOC", "FIFA", "DS",
                    "WMO", "GAUL", "MARC", "Dial", "is_independent"]
    alpha2_key = "ISO3166-1-Alpha-2"

    # to use statoids.com column names instead, replace the preceding
    # two assignments with:
    #
    #     column_names = []
    #     alpha2_key = 'A-2'
    #     for tr in doc.find_class('hd'):
    #         for th in tr.iterchildren():
    #             column_names.append(th.text_content())

    # dict to hold dicts of all table rows
    table_rows = {}

    # the country code info is in a table where the trs have
    # alternating classes of `e` and `o`, so collect one half after the
    # other and zip each row together with the corresponding column name
    for row_class in ('e', 'o'):
        for tr in doc.find_class(row_class):
            row = process_statoids_row(tr)
            row_dict = collections.OrderedDict(zip(column_names, row))
            # statoids-assigned 'Entity' name is not really a standard
            row_dict.pop('Entity')
            table_rows.update({row_dict[alpha2_key]: row_dict})

    # dict to hold combined country info
    country_info = {}
    keyed_by = options.key

    # iterate through all the table_rows
    # TODO this assumes that statoids will have all of
    # the items that are pulled from iso.org
    for alpha2, cinfo in table_rows.items():
        # ignore rows that were parsed from other tables on the page
        if alpha2 in ['', 'Codes', 'Codes Codes', 'Codes Codes Codes']:
            continue
        # add iso.org's names to combined dict of this country's info
        cinfo.update(iso_names[alpha2])
        # replace all-caps name with capitalized country name
        cinfo.update({'name': capitalize_country_name(cinfo['name'])})
        # a missing french name is treated as blank, the same way a
        # missing english name is treated above
        cinfo.update({'name_fr': capitalize_country_name(cinfo.get('name_fr', ''))})
        # add combined dict to global (pun intended) data structure
        ckey = cinfo[keyed_by]
        country_info.update({ckey: cinfo})

    country_info = get_currency_data(country_info, en_names)

    # reorganize data for export
    if options.as_list:
        # if exporting as list, sort by country name
        country_info = sorted(country_info.values(), key=operator.itemgetter('name'))

    # dump dict as json to file
    output_filename = options.outfile or "data/country-codes.json"
    # the with-block closes (and thereby flushes) the file once the
    # dump is complete
    with open(output_filename, mode='w') as f:
        stream = codecs.getwriter('utf8')(f)
        json.dump(country_info, stream, ensure_ascii=False, indent=2, encoding='utf-8')
    print_info('Saved country data to: %s' % output_filename)
|
358
|
+
|
359
|
+
if __name__ == "__main__":
    # command-line entry point: declare the three options, parse the
    # arguments and hand the resulting namespace to fetch_and_write
    cli = argparse.ArgumentParser(
        description='Fetch current ISO 3166 country codes and other standards and output as JSON file')
    option_specs = [
        (("-o", "--output"),
         dict(dest="outfile", default="data/country-codes.json",
              help="write data to OUTFILE", metavar="OUTFILE")),
        (("-l", "--list"),
         dict(dest="as_list", default=False, action="store_true",
              help="export objects as a list of objects")),
        (("-k", "--key"),
         dict(dest="key", default="ISO3166-1-Alpha-2",
              help="export objects as a dict of objects keyed by KEY", metavar="KEY")),
    ]
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    fetch_and_write(cli.parse_args())
|