mode 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/README.md +17 -22
- data/bin/mode +1 -1
- data/lib/mode.rb +34 -6
- data/lib/mode/api/form.rb +53 -0
- data/lib/mode/api/link.rb +31 -0
- data/lib/mode/api/request.rb +181 -0
- data/lib/mode/api/resource.rb +67 -0
- data/lib/mode/auth/access_token.rb +23 -0
- data/lib/mode/cli.rb +3 -3
- data/lib/mode/cli/analyze.rb +1 -1
- data/lib/mode/cli/base.rb +5 -0
- data/lib/mode/cli/connect.rb +18 -0
- data/lib/mode/cli/helpers.rb +0 -9
- data/lib/mode/cli/import.rb +9 -38
- data/lib/mode/cli/login.rb +13 -0
- data/lib/mode/cli/package.rb +2 -5
- data/lib/mode/commands/analyze_field.rb +20 -21
- data/lib/mode/commands/analyze_schema.rb +69 -48
- data/lib/mode/commands/connect.rb +78 -0
- data/lib/mode/commands/helpers.rb +54 -0
- data/lib/mode/commands/import.rb +209 -20
- data/lib/mode/commands/login.rb +111 -0
- data/lib/mode/config.rb +13 -33
- data/lib/mode/configurable.rb +46 -0
- data/lib/mode/connector/config.rb +31 -0
- data/lib/mode/connector/daemon.rb +27 -0
- data/lib/mode/connector/data_source.rb +75 -0
- data/lib/mode/connector/dataset.rb +13 -0
- data/lib/mode/connector/message.rb +31 -0
- data/lib/mode/connector/poller.rb +27 -0
- data/lib/mode/connector/processor.rb +58 -0
- data/lib/mode/connector/registrar.rb +36 -0
- data/lib/mode/connector/scheduler.rb +62 -0
- data/lib/mode/connector/selector.rb +47 -0
- data/lib/mode/connector/type_map.rb +45 -0
- data/lib/mode/connector/uploader.rb +50 -0
- data/lib/mode/logger.rb +202 -0
- data/lib/mode/version.rb +1 -1
- data/mode.gemspec +13 -2
- data/spec/api/form_spec.rb +51 -0
- data/spec/api/link_spec.rb +23 -0
- data/spec/api/request_spec.rb +111 -0
- data/spec/api/resource_spec.rb +70 -0
- data/spec/auth/access_token_spec.rb +22 -0
- data/spec/commands/analyze_field_spec.rb +26 -0
- data/spec/commands/analyze_schema_spec.rb +7 -5
- data/spec/commands/connect_spec.rb +80 -0
- data/spec/commands/helpers_spec.rb +69 -0
- data/spec/commands/import_spec.rb +155 -0
- data/spec/commands/login_spec.rb +178 -0
- data/spec/config_spec.rb +9 -7
- data/spec/connector/config_spec.rb +46 -0
- data/spec/connector/daemon_spec.rb +30 -0
- data/spec/connector/data_source_spec.rb +73 -0
- data/spec/connector/message_spec.rb +22 -0
- data/spec/connector/poller_spec.rb +26 -0
- data/spec/connector/processor_spec.rb +93 -0
- data/spec/connector/registrar_spec.rb +53 -0
- data/spec/connector/scheduler_spec.rb +93 -0
- data/spec/connector/selector_spec.rb +54 -0
- data/spec/connector/type_map_spec.rb +45 -0
- data/spec/connector/uploader_spec.rb +55 -0
- data/spec/fixtures/country-codes/README.md +71 -0
- data/spec/fixtures/country-codes/data/country-codes.csv +250 -0
- data/spec/fixtures/country-codes/datapackage.json +142 -0
- data/spec/fixtures/country-codes/scripts/get_countries_of_earth.py +370 -0
- data/spec/fixtures/country-codes/scripts/reorder_columns.py +8 -0
- data/spec/fixtures/country-codes/scripts/requirements.pip +2 -0
- data/spec/fixtures/espn_draft.csv +473 -1
- data/spec/fixtures/espn_draft/data.csv +473 -0
- data/spec/fixtures/espn_draft/datapackage.json +43 -0
- data/spec/logger_spec.rb +79 -0
- data/spec/spec_helper.rb +6 -1
- metadata +156 -19
- data/lib/mode/cli/setup.rb +0 -12
- data/lib/mode/commands/package.rb +0 -56
- data/lib/mode/commands/setup.rb +0 -36
- data/lib/mode/package_builder.rb +0 -57
- data/spec/commands/setup_spec.rb +0 -62
- data/spec/fixtures/MOCK_DATA.csv +0 -100001
- data/spec/fixtures/cb_clean_small.csv +0 -100000
- data/spec/fixtures/duplicate_keys.csv +0 -3
- data/spec/fixtures/format_examples.csv.txt +0 -6
- data/spec/fixtures/format_examples_after_excel.csv.txt +0 -1
@@ -0,0 +1,142 @@
|
|
1
|
+
{
|
2
|
+
"name": "country-codes",
|
3
|
+
"title": "Comprehensive country codes: ISO 3166, ITU, ISO 4217 currency codes and many more",
|
4
|
+
"format": "csv",
|
5
|
+
"datapackage_version": "1.0-beta.3",
|
6
|
+
"licenses": [
|
7
|
+
{
|
8
|
+
"id": "odc-pddl",
|
9
|
+
"name": "Public Domain Dedication and License",
|
10
|
+
"version": "1.0",
|
11
|
+
"url": "http://opendatacommons.org/licenses/pddl/1.0/"
|
12
|
+
}
|
13
|
+
],
|
14
|
+
"sources": [
|
15
|
+
{
|
16
|
+
"name": "International Organization for Standardization",
|
17
|
+
"web": "http://www.iso.org/iso/country_codes/iso_3166_code_lists.htm"
|
18
|
+
},
|
19
|
+
{
|
20
|
+
"name": "SIX Interbank Clearing Ltd (on behalf of ISO)",
|
21
|
+
"web": "http://www.currency-iso.org/dam/downloads/dl_iso_table_a1.xls"
|
22
|
+
},
|
23
|
+
{
|
24
|
+
"name": "Statoids",
|
25
|
+
"web": "http://www.statoids.com/wab.html"
|
26
|
+
}
|
27
|
+
],
|
28
|
+
"resources": [
|
29
|
+
{
|
30
|
+
"url": "https://raw.github.com/datasets/country-codes/master/data/country-codes.csv",
|
31
|
+
"path": "data/country-codes.csv",
|
32
|
+
"schema": {
|
33
|
+
"fields": [
|
34
|
+
{
|
35
|
+
"name": "name",
|
36
|
+
"description": "Country's official English short name",
|
37
|
+
"type": "string"
|
38
|
+
},
|
39
|
+
{
|
40
|
+
"name": "name_fr",
|
41
|
+
"description": "Country's official French short name",
|
42
|
+
"type": "string"
|
43
|
+
},
|
44
|
+
{
|
45
|
+
"name": "ISO3166-1-Alpha-2",
|
46
|
+
"description": "Alpha-2 codes from ISO 3166-1",
|
47
|
+
"type": "string"
|
48
|
+
},
|
49
|
+
{
|
50
|
+
"name": "ISO3166-1-Alpha-3",
|
51
|
+
"description": "Alpha-3 codes from ISO 3166-1 (synonymous with World Bank Codes)",
|
52
|
+
"type": "string"
|
53
|
+
},
|
54
|
+
{
|
55
|
+
"name": "ISO3166-1-numeric",
|
56
|
+
"description": "Numeric codes from ISO 3166-1 (synonymous with UN Statistics M49 Codes)",
|
57
|
+
"type": "integer"
|
58
|
+
},
|
59
|
+
{
|
60
|
+
"name": "ITU",
|
61
|
+
"description": "Codes assigned by the International Telecommunications Union",
|
62
|
+
"type": "string"
|
63
|
+
},
|
64
|
+
{
|
65
|
+
"name": "MARC",
|
66
|
+
"description": "MAchine-Readable Cataloging codes from the Library of Congress",
|
67
|
+
"type": "string"
|
68
|
+
},
|
69
|
+
{
|
70
|
+
"name": "WMO",
|
71
|
+
"description": "Country abbreviations by the World Meteorological Organization",
|
72
|
+
"type": "string"
|
73
|
+
},
|
74
|
+
{
|
75
|
+
"name": "DS",
|
76
|
+
"description": "Distinguishing signs of vehicles in international traffic",
|
77
|
+
"type": "string"
|
78
|
+
},
|
79
|
+
{
|
80
|
+
"name": "Dial",
|
81
|
+
"description": "Country code from ITU-T recommendation E.164, sometimes followed by area code",
|
82
|
+
"type": "string"
|
83
|
+
},
|
84
|
+
{
|
85
|
+
"name": "FIFA",
|
86
|
+
"description": "Codes assigned by the Fédération Internationale de Football Association",
|
87
|
+
"type": "string"
|
88
|
+
},
|
89
|
+
{
|
90
|
+
"name": "FIPS",
|
91
|
+
"description": "Codes from the U.S. standard FIPS PUB 10-4",
|
92
|
+
"type": "string"
|
93
|
+
},
|
94
|
+
{
|
95
|
+
"name": "GAUL",
|
96
|
+
"description": "Global Administrative Unit Layers from the Food and Agriculture Organization",
|
97
|
+
"type": "integer"
|
98
|
+
},
|
99
|
+
{
|
100
|
+
"name": "IOC",
|
101
|
+
"description": "Codes assigned by the International Olympics Committee",
|
102
|
+
"type": "string"
|
103
|
+
},
|
104
|
+
{
|
105
|
+
"name": "currency_alphabetic_code",
|
106
|
+
"description": "ISO 4217 currency alphabetic code",
|
107
|
+
"type": "string"
|
108
|
+
},
|
109
|
+
{
|
110
|
+
"name": "currency_country_name",
|
111
|
+
"description": "ISO 4217 country name",
|
112
|
+
"type": "string"
|
113
|
+
},
|
114
|
+
{
|
115
|
+
"name": "currency_minor_unit",
|
116
|
+
"description": "ISO 4217 currency number of minor units",
|
117
|
+
"type": "integer"
|
118
|
+
},
|
119
|
+
{
|
120
|
+
"name": "currency_name",
|
121
|
+
"description": "ISO 4217 currency name",
|
122
|
+
"type": "string"
|
123
|
+
},
|
124
|
+
{
|
125
|
+
"name": "currency_numeric_code",
|
126
|
+
"description": "ISO 4217 currency numeric code",
|
127
|
+
"type": "integer"
|
128
|
+
},
|
129
|
+
{
|
130
|
+
"name": "is_independent",
|
131
|
+
"description": "Country status, based on the CIA World Factbook",
|
132
|
+
"type": "string"
|
133
|
+
}
|
134
|
+
]
|
135
|
+
}
|
136
|
+
}
|
137
|
+
],
|
138
|
+
"maintainers":[{
|
139
|
+
"name": "Evan Wheeler",
|
140
|
+
"web": "https://github.com/datasets/country-codes"
|
141
|
+
}]
|
142
|
+
}
|
@@ -0,0 +1,370 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# vim: ai ts=4 sts=4 et sw=4
|
4
|
+
import codecs
|
5
|
+
import urllib
|
6
|
+
import argparse
|
7
|
+
import json
|
8
|
+
import operator
|
9
|
+
import collections
|
10
|
+
|
11
|
+
from lxml import html
|
12
|
+
from lxml import etree
|
13
|
+
|
14
|
+
# ANSI escape sequences for colored terminal output.
# The print_* helpers below wrap their message as <color> + text + ENDC.
BLUE = '\x1b[94m'
GREEN = '\x1b[92m'
YELLOW = '\x1b[93m'
RED = '\x1b[91m'
BOLD = '\x1b[1m'
ENDC = '\x1b[0m'
|
21
|
+
|
22
|
+
|
23
|
+
def print_info(string):
    """Print ``string`` to stdout wrapped in the GREEN and ENDC escape codes."""
    # A parenthesized single argument works both as a Python 2 print
    # statement and as a Python 3 print() call.
    print(GREEN + string + ENDC)
|
25
|
+
|
26
|
+
|
27
|
+
def print_warn(string):
    """Print ``string`` to stdout wrapped in the YELLOW and ENDC escape codes."""
    # A parenthesized single argument works both as a Python 2 print
    # statement and as a Python 3 print() call.
    print(YELLOW + string + ENDC)
|
29
|
+
|
30
|
+
|
31
|
+
def print_error(string):
    """Print ``string`` to stdout wrapped in the RED and ENDC escape codes."""
    # A parenthesized single argument works both as a Python 2 print
    # statement and as a Python 3 print() call.
    print(RED + string + ENDC)
|
33
|
+
|
34
|
+
|
35
|
+
def process_statoids_row(tr):
    """Flatten one statoids.com table row into a list of cell values.

    NOTE(review): this listing was recovered from a flattened diff view in
    which indentation was lost, so the nesting below is a reconstruction --
    confirm it against the upstream script before relying on it.

    Args:
        tr: an lxml ``<tr>`` element; each child element is treated as a cell.

    Returns:
        A list of strings, one per table column:

        * a cell with a ``colspan`` attribute contributes that many ``''``
          entries;
        * a cell whose only child is ``<br>`` contributes its text and the
          text after the ``<br>`` joined by a space;
        * a cell whose only child is ``<code>`` contributes the code text
          (plus the text after a nested ``<br>``, comma-separated), except
          for the hard-coded GB / UM footnote cells;
        * every other cell contributes ``td.text_content()``.

    Raises:
        AssertionError: if a ``colspan`` is not numeric or a footnote anchor
            does not carry the expected marker text.
    """
    row = []
    for td in tr.iterchildren():
        colspan = td.get('colspan')
        if colspan is not None:
            # if a cell is taking up more than one column,
            # append the same number of blanks to the row
            assert colspan.isdigit()
            row.extend([''] * int(colspan))
            continue

        children = td.getchildren()
        if len(children) == 1:
            only_child = children[0]
            if only_child.tag == 'br':
                # text broken over two lines: join the two halves
                td.text = td.text + " " + only_child.tail
                row.append(td.text)
                continue

            code = td.find("code")
            if code is not None:
                # some cells contain more than one code,
                # so append a list also containing the code
                # that appears after the child element (<br>)
                if len(code.getchildren()) > 0:
                    br = td.find('.//br')
                    if br is not None:
                        row.append(code.text + ',' + br.tail)
                        continue
                    anchor = td.find('.//a')
                    # row[1] is only meaningful once two cells have been
                    # collected; without the length guard a footnote in one
                    # of the first two cells raised IndexError.
                    if anchor is not None and len(row) > 1:
                        # UK has 4 FIFA codes
                        if row[1] == "GB":
                            assert anchor.text == "1"
                            row.append("ENG,NIR,SCO,WAL")
                            continue
                        # MARC treats United States Minor Outlying Islands
                        # as five countries
                        if row[1] == "UM":
                            assert anchor.text == "b"
                            row.append("ji,xf,wk,uc,up")
                            continue
                # some cells contain anchor to footnote,
                # so append only the content of the code element
                row.append(code.text)
                continue

            anchor = td.find('.//a')
            # FIPS treats United States Minor Outlying Islands
            # as nine countries
            if anchor is not None and len(row) > 1 and row[1] == "UM":
                assert anchor.text == "a"
                row.append("FQ,HQ,DQ,JQ,KQ,MQ,BQ,LQ,WQ")
                continue

        row.append(td.text_content())
    return row
|
89
|
+
|
90
|
+
def clean_line(line):
    """Split one ``NAME;ALPHA2`` line of a downloaded ISO country list.

    Args:
        line: one raw line of the list, as a UTF-8 encoded byte string.

    Returns:
        A ``(name, alpha2)`` tuple split at the first semicolon, with
        trailing whitespace removed from the line first. Returns
        ``(None, None)`` when the line contains no semicolon or cannot be
        decoded as UTF-8 (a warning is printed in the latter case).
    """
    try:
        text = line.decode('utf8').rstrip()
    except UnicodeDecodeError:
        print_warn('Unable to decode country name: %s' % line)
        # Callers unpack the result into two names, so an undecodable line
        # must still yield a 2-tuple rather than an implicit None.
        return (None, None)

    name, separator, alpha2 = text.partition(';')
    if not separator:
        return (None, None)
    return (name, alpha2)
|
102
|
+
|
103
|
+
def capitalize_country_name(name):
    """Convert an all-caps country name to conventional capitalization.

    The name is split on whitespace and each word is handled by the first
    rule that applies:

    * ``MCDONALD`` becomes ``McDonald``;
    * words containing a period after the first character stay upper-case
      (``U.S.``);
    * ``X'YYY`` with a one-letter prefix becomes ``x'Yyy`` (``d'Ivoire``);
    * hyphenated words have every part capitalized (``Timor-Leste``);
    * a leading ``(`` or trailing ``)`` is kept around the re-cased word;
    * words in ``always_lower`` are lower-cased, everything else is
      capitalized.

    Args:
        name: the country name, upper-cased, as a string.

    Returns:
        The re-cased words joined by single spaces.
    """
    always_lower = ('AND', 'THE', 'OF', 'PART', 'DA', 'DE', 'ET', 'DU', 'DES',
                    'LA')

    def recase(word):
        # small connecting words stay lower-case, the rest are capitalized
        return word.lower() if word in always_lower else word.capitalize()

    cap_list = []
    for w in name.split():
        if w == 'MCDONALD':
            cap_list.append('McDonald')
            # Stop here: falling through appended the word a second time as
            # 'Mcdonald' ("Heard Island and McDonald Mcdonald Islands").
            continue

        if w.find('.') > 0:
            cap_list.append(w.upper())
            continue

        if w.find('\'') > 0:
            # d'Ivoire instead of D'ivoire
            pieces = w.split('\'')
            if len(pieces[0]) == 1:
                cap_list.append(
                    pieces[0].lower() + '\'' + pieces[1].capitalize())
                continue
            # longer prefixes (PEOPLE'S) fall through to the rules below

        if w.find('-') > 0:
            # Timor-Leste instead of Timor-leste
            cap_list.append(
                '-'.join([part.capitalize() for part in w.split('-')]))
            continue

        if w.startswith('('):
            cap_list.append('(' + recase(w.replace('(', '')))
            continue

        if w[-1] == ')':
            cap_list.append(recase(w.replace(')', '')) + ')')
            continue

        cap_list.append(recase(w))

    return " ".join(cap_list)
|
150
|
+
|
151
|
+
|
152
|
+
def get_currency_data(country_info, en_names):
    """Download the ISO 4217 currency table and merge it into ``country_info``.

    NOTE(review): this listing was recovered from a flattened diff view in
    which indentation was lost, so the nesting below is a reconstruction --
    confirm it against the upstream script.

    Args:
        country_info: dict of per-country dicts; entries are looked up by
            the alpha-2 code found through ``en_names`` and updated in place.
        en_names: dict mapping an upper-case English country name to its
            alpha-2 code.

    Returns:
        The same ``country_info`` dict, after the in-place updates.
    """
    # fetch iso currency codes
    currency_url = "http://www.currency-iso.org/dam/downloads/table_a1.xml"
    print_info('Fetching currency codes...')
    currencies_xml_str = urllib.urlopen(currency_url).read()
    currencies = etree.fromstring(currencies_xml_str)

    # map source's tag names to our property names
    currency_tag_map = {
        u"CtryNm": u"currency_country_name",
        u"CcyNm": u"currency_name",
        u"Ccy": u"currency_alphabetic_code",
        u"CcyNbr": u"currency_numeric_code",
        u"CcyMnrUnts": u"currency_minor_unit",
        u"AddtlInf": u"currency_additional_info"
    }
    # reconcile country names, add entries for non-country-based currencies
    # NOTE(review): names are looked up here only after '\n' has been
    # stripped from them, so the two keys containing '\n' look unreachable.
    currency_country_name_map = {
        u"MACEDONIA, THE FORMER \nYUGOSLAV REPUBLIC OF": "MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF",
        u"SAINT HELENA, ASCENSION AND \nTRISTAN DA CUNHA": "SAINT HELENA, ASCENSION AND TRISTAN DA CUNHA",
        u"CONGO, THE DEMOCRATIC REPUBLIC OF": "CONGO, THE DEMOCRATIC REPUBLIC OF THE",
        u"HEARD ISLAND AND McDONALD ISLANDS": "HEARD ISLAND AND MCDONALD ISLANDS",
        u"KOREA, DEMOCRATIC PEOPLE’S REPUBLIC OF": "KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF",
        u"LAO PEOPLE’S DEMOCRATIC REPUBLIC": "LAO PEOPLE'S DEMOCRATIC REPUBLIC",
        u"SERBIA ": "SERBIA",
        u"PALESTINIAN TERRITORY, OCCUPIED": "PALESTINE, STATE OF",
        u"Vatican City State (HOLY SEE)": "HOLY SEE (VATICAN CITY STATE)",
        u"VIRGIN ISLANDS (BRITISH)": "VIRGIN ISLANDS, BRITISH",
        u"VIRGIN ISLANDS (US)": "VIRGIN ISLANDS, U.S.",
        u"MEMBER COUNTRIES OF THE AFRICAN DEVELOPMENT BANK GROUP": None,
        u"INTERNATIONAL MONETARY FUND (IMF)": None,
        u"SISTEMA UNITARIO DE COMPENSACION REGIONAL DE PAGOS \"SUCRE\"": None,
        u"EUROPEAN UNION": None,
        u"ZZ01_Bond Markets Unit European_EURCO": None,
        u"ZZ02_Bond Markets Unit European_EMU-6": None,
        u"ZZ03_Bond Markets Unit European_EUA-9": None,
        u"ZZ04_Bond Markets Unit European_EUA-17": None,
        u"ZZ05_UIC-Franc": None,
        u"ZZ06_Testing_Code": None,
        u"ZZ07_No_Currency": None,
        u"ZZ08_Gold": None,
        u"ZZ09_Palladium": None,
        u"ZZ10_Platinum": None,
        u"ZZ11_Silver": None,
    }

    def process_element(country):
        """Merge one currency-table entry into the matching country dict."""
        currency_dict = {}
        for currency_tag in country.iterchildren():
            # Tags missing from the map used to raise KeyError; skip them so
            # a field newly added to the source table does not abort the run.
            prop = currency_tag_map.get(currency_tag.tag)
            if prop is None:
                continue
            # ignore newly added additional info field
            if prop == "currency_additional_info":
                break
            # skip 'same day', 'next day', etc variations
            if prop == "currency_name" and len(currency_tag.items()) > 0:
                if currency_tag.items()[0][0] == 'IsFund':
                    break
                # a name tag carrying some other attribute is not recorded
                continue
            currency_dict[prop] = currency_tag.text

        # The None check has to come before the .replace() calls; the old
        # check ran after them and could never be false.
        raw_name = currency_dict.get('currency_country_name')
        if raw_name is None:
            return

        # remove random line breaks, etc
        currency_name = raw_name.replace(u'\xa0', u'').replace(u'\n', u'').replace(u'\r', u'')
        # replace name with line breaks, etc removed
        currency_dict['currency_country_name'] = currency_name

        # try the name as-is first, then its reconciled spelling
        currency_alpha2 = en_names.get(currency_name)
        if currency_alpha2 is None:
            currency_alpha2 = en_names.get(
                currency_country_name_map.get(currency_name))

        if not currency_alpha2:
            # names listed in the map with None are known non-countries
            if currency_name not in currency_country_name_map:
                print_warn('Failed to match currency data for country: "%s"'
                           % currency_name)
            return

        # country_info may not hold this code (country missing from the other
        # sources, or the dict is keyed by something other than alpha-2);
        # warn instead of raising KeyError.
        target = country_info.get(currency_alpha2)
        if target is None:
            print_warn('Failed to match currency data for country: "%s"'
                       % currency_name)
            return
        target.update(currency_dict)

    for iso_currency_table in currencies.iterchildren():
        for country in iso_currency_table.iterchildren():
            process_element(country)

    return country_info
|
236
|
+
|
237
|
+
def fetch_and_write(options):
    """Fetch country data from the web, merge it and write it out as JSON.

    Downloads the English and French ISO name lists, scrapes the code table
    on statoids.com, adds currency data via ``get_currency_data`` and dumps
    the combined structure to a file.

    Args:
        options: parsed command-line options. Reads ``key`` (the column used
            to key the output dict), ``as_list`` (export a name-sorted list
            instead of a dict) and ``outfile`` (output path; falls back to
            ``data/country-codes.json`` when empty).
    """
    # fetch ISO short names in English and French
    print_info('Fetching English country names and codes...')
    iso_names_en = urllib.urlretrieve('http://www.iso.org/iso/list-en1-semic-3.txt')
    print_info('Fetching French country names and codes...')
    iso_names_fr = urllib.urlretrieve('http://www.iso.org/iso/list-fr1-semic.txt')

    # dict for combining en and fr names
    # {alpha2: {'name': en, 'name_fr': fr}}
    iso_names = {}

    # dict for looking up alpha2 from name
    en_names = {}

    # urllib.urlretrieve returns a tuple of (localfile, headers)
    with open(iso_names_en[0], "rU") as fin:
        for line in fin:
            name, alpha2 = clean_line(line)
            if name and alpha2:
                iso_names[alpha2] = {'name': name}
                en_names[name] = alpha2

    with open(iso_names_fr[0], "rU") as fin:
        for line in fin:
            name, alpha2 = clean_line(line)
            if name and alpha2:
                if alpha2 in iso_names:
                    # alpha2 should be in iso_names because english was
                    # parsed first, so add the french name to its entry
                    iso_names[alpha2]['name_fr'] = name
                else:
                    # hopefully this doesn't happen, but in case there was
                    # no english name, add french with a blank where
                    # english should be
                    iso_names[alpha2] = {'name': '', 'name_fr': name}

    # fetch content of statoids.com country code page
    statoids_url = "http://www.statoids.com/wab.html"
    print_info('Fetching other country codes...')
    content = urllib.urlopen(statoids_url).read()
    doc = html.fromstring(content)

    # i dislike some of statoid's column names, so here i have renamed
    # a few to be more descriptive
    column_names = ["Entity", "ISO3166-1-Alpha-2", "ISO3166-1-Alpha-3",
                    "ISO3166-1-numeric", "ITU", "FIPS", "IOC", "FIFA", "DS",
                    "WMO", "GAUL", "MARC", "Dial", "is_independent"]
    alpha2_key = "ISO3166-1-Alpha-2"

    # to use statoids.com column names instead, replace the two
    # assignments above with:
    #
    #     column_names = []
    #     alpha2_key = 'A-2'
    #     for tr in doc.find_class('hd'):
    #         for th in tr.iterchildren():
    #             column_names.append(th.text_content())

    # dict to hold dicts of all table rows
    table_rows = {}

    # the country code info is in a table where the trs have
    # alternating classes of `e` and `o`, so fetch each half in turn
    # and zip each row together with the corresponding column name
    for row_class in ('e', 'o'):
        for tr in doc.find_class(row_class):
            row = process_statoids_row(tr)
            row_dict = collections.OrderedDict(zip(column_names, row))
            # statoids-assigned 'Entity' name is not really a standard
            row_dict.pop('Entity')
            table_rows[row_dict[alpha2_key]] = row_dict

    # dict to hold combined country info
    country_info = {}
    keyed_by = options.key

    # iterate through all the table_rows
    # TODO this assumes that statoids will have all of
    # the items that are pulled from iso.org
    for alpha2, cinfo in table_rows.iteritems():
        # ignore entries that were parsed from other tables on the page
        if alpha2 in ['', 'Codes', 'Codes Codes', 'Codes Codes Codes']:
            continue
        # add iso.org's names to combined dict of this country's info
        cinfo.update(iso_names[alpha2])
        # replace all-caps name with capitalized country name
        cinfo['name'] = capitalize_country_name(cinfo['name'])
        cinfo['name_fr'] = capitalize_country_name(cinfo['name_fr'])
        # add combined dict to the overall data structure
        country_info[cinfo[keyed_by]] = cinfo

    country_info = get_currency_data(country_info, en_names)

    # reorganize data for export
    if options.as_list:
        # if exporting as list, sort by country name
        country_info = sorted(country_info.values(), key=operator.itemgetter('name'))

    # dump dict as json to file
    output_filename = "data/country-codes.json"
    if options.outfile:
        output_filename = options.outfile
    # the with-block closes (and so flushes) the file; the handle used to be
    # left open
    with open(output_filename, mode='w') as f:
        stream = codecs.getwriter('utf8')(f)
        json.dump(country_info, stream, ensure_ascii=False, indent=2, encoding='utf-8')
    print_info('Saved country data to: %s' % output_filename)
|
358
|
+
|
359
|
+
if __name__ == "__main__":
    # Command-line entry point: build the option parser, then hand the
    # parsed options straight to fetch_and_write.
    parser = argparse.ArgumentParser(
        description='Fetch current ISO 3166 country codes and other standards and output as JSON file',
    )
    parser.add_argument(
        "-o", "--output",
        dest="outfile",
        metavar="OUTFILE",
        default="data/country-codes.json",
        help="write data to OUTFILE",
    )
    parser.add_argument(
        "-l", "--list",
        dest="as_list",
        action="store_true",
        default=False,
        help="export objects as a list of objects",
    )
    parser.add_argument(
        "-k", "--key",
        dest="key",
        metavar="KEY",
        default="ISO3166-1-Alpha-2",
        help="export objects as a dict of objects keyed by KEY",
    )

    fetch_and_write(parser.parse_args())
|