hdx-python-scraper 2.1.9__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdx/scraper/_version.py +14 -2
- hdx/scraper/base_scraper.py +13 -4
- hdx/scraper/configurable/scraper.py +6 -1
- hdx/scraper/runner.py +49 -23
- hdx/scraper/utilities/reader.py +15 -7
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/METADATA +3 -3
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/RECORD +9 -9
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/WHEEL +0 -0
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/licenses/LICENSE +0 -0
hdx/scraper/_version.py
CHANGED

@@ -1,4 +1,16 @@
 # file generated by setuptools_scm
 # don't change, don't track in version control
-__version__ = version = '2.1.9'
-__version_tuple__ = version_tuple = (2, 1, 9)
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '2.2.1'
+__version_tuple__ = version_tuple = (2, 2, 1)
hdx/scraper/base_scraper.py
CHANGED

@@ -346,14 +346,23 @@ class BaseScraper(ABC):
         """
         return self.source_urls
 
-    def get_hapi_metadata(self) -> Optional[Dict]:
+    def get_hapi_dataset_metadata(self) -> Optional[Dict]:
         """
-        Get HAPI metadata
+        Get HAPI dataset metadata
 
         Returns:
-            Optional[Dict]: HAPI metadata
+            Optional[Dict]: HAPI dataset metadata
         """
-        return self.datasetinfo.get("hapi_metadata")
+        return self.datasetinfo.get("hapi_dataset_metadata")
+
+    def get_hapi_resource_metadata(self) -> Optional[Dict]:
+        """
+        Get HAPI resource metadata
+
+        Returns:
+            Optional[Dict]: HAPI resource metadata
+        """
+        return self.datasetinfo.get("hapi_resource_metadata")
 
     def add_population(self) -> None:
         """
hdx/scraper/configurable/scraper.py
CHANGED

@@ -122,6 +122,7 @@ class ConfigurableScraper(BaseScraper):
             "input": datasetinfo.get("input", []),
             "transform": datasetinfo.get("transform", {}),
             "population_key": datasetinfo.get("population_key"),
+            "list": datasetinfo.get("list", []),
             "process": datasetinfo.get("process", []),
             "input_keep": datasetinfo.get("input_keep", []),
             "input_append": datasetinfo.get("input_append", []),

@@ -292,6 +293,7 @@ class ConfigurableScraper(BaseScraper):
         filter = subset["filter"]
         input_ignore_vals = subset.get("input_ignore_vals", [])
         input_transforms = subset.get("transform", {})
+        list_cols = subset.get("list")
         sum_cols = subset.get("sum")
         process_cols = subset.get("process")
         input_append = subset.get("input_append", [])

@@ -304,6 +306,8 @@ class ConfigurableScraper(BaseScraper):
                     val = eval(input_transform.replace(valcol, "val"))
                 if sum_cols or process_cols:
                     dict_of_lists_add(valuedict, adm, val)
+                elif list_cols and valcol in list_cols:
+                    dict_of_lists_add(valuedict, adm, val)
                 else:
                     curval = valuedict.get(adm)
                     if valcol in input_append:

@@ -326,6 +330,7 @@ class ConfigurableScraper(BaseScraper):
             population_str = "self.population_lookup[adm]"
         else:
             population_str = "self.population_lookup[population_key]"
+        subset.get("list")
         process_cols = subset.get("process")
         input_keep = subset.get("input_keep", [])
         sum_cols = subset.get("sum")

@@ -440,7 +445,7 @@ class ConfigurableScraper(BaseScraper):
                     valcols[i], f"newvaldicts[{i}][adm]"
                 )
             formula = formula.replace("#pzbgvjh", population_str)
-            for adm in valdicts[0]
+            for adm in valdicts[0]:
                 try:
                     val = eval(formula)
                 except (ValueError, TypeError, KeyError):
hdx/scraper/runner.py
CHANGED

@@ -1146,51 +1146,67 @@ class Runner:
 
     def get_hapi_metadata(
         self, names: Optional[ListTuple[str]] = None
-    ) ->
-        """Get HAPI metadata for all datasets
+    ) -> Dict:
+        """Get HAPI metadata for all datasets. A dictionary is returned that
+        maps from dataset ids to a dictionary. The dictionary has keys for
+        dataset metadata and a key resources under which is a dictionary that
+        maps from resource ids to resource metadata.
 
         Args:
             names (Optional[ListTuple[str]]): Names of scrapers
 
         Returns:
-
+            Dict: HAPI metadata for all datasets
         """
         if not names:
             names = self.scrapers.keys()
-
+        results = {}
         for name in names:
             scraper = self.get_scraper(name)
             if not scraper.has_run:
                 continue
-
-
-
-
+            hapi_dataset_metadata = scraper.get_hapi_dataset_metadata()
+            hapi_resource_metadata = scraper.get_hapi_resource_metadata()
+            dataset_id = hapi_dataset_metadata["hdx_id"]
+            resource_id = hapi_resource_metadata["hdx_id"]
+            hapi_metadata = results.get(
+                dataset_id, copy(hapi_dataset_metadata)
+            )
+            hapi_resources = hapi_metadata.get("resources", {})
+            hapi_resources[resource_id] = hapi_resource_metadata
+            hapi_metadata["resources"] = hapi_resources
+            results[dataset_id] = hapi_metadata
+        return results
 
     def get_hapi_results(
         self,
         names: Optional[ListTuple[str]] = None,
         has_run: bool = True,
-    ) ->
-        """Get the results (headers
-        limiting to those in names if given and limiting
-        have been set in the constructor if previously
-        scrapers marked as having run are returned
-
-
-
-
+    ) -> Dict:
+        """Get the results (headers and values per admin level and HAPI
+        metadata) for scrapers limiting to those in names if given and limiting
+        further to those that have been set in the constructor if previously
+        given. By default, only scrapers marked as having run are returned
+        unless has_run is set to False.
+
+        A dictionary is returned where key is HDX dataset id and value is a
+        dictionary that has HAPI dataset metadata as well as a results key.
+        The value associated with the results key is a dictionary where each
+        key is an admin level. Each admin level key has a value dictionary with
+        headers, values and HAPI resource metadata. Headers is a tuple of
+        (column headers, hxl hashtags). Values is a list. HAPI resource
+        metadata is a dictionary.
 
         Args:
             names (Optional[ListTuple[str]]): Names of scrapers. Defaults to None (all scrapers).
             has_run (bool): Only get results for scrapers marked as having run. Defaults to True.
 
         Returns:
-
+            Dict: Headers and values per admin level and HAPI metadata for all datasets
         """
         if not names:
             names = self.scrapers.keys()
-        results =
+        results = {}
 
         def add_results(scraper_level, scrap, levels_used):
             nonlocal results

@@ -1201,11 +1217,21 @@
             if headers is None:
                 return
             values = scrap.get_values(scraper_level)
-
-
-
+            hapi_dataset_metadata = scrap.get_hapi_dataset_metadata()
+            hapi_resource_metadata = scrap.get_hapi_resource_metadata()
+            dataset_id = hapi_dataset_metadata["hdx_id"]
+            hapi_metadata = results.get(
+                dataset_id, copy(hapi_dataset_metadata)
+            )
+            level_results = hapi_metadata.get("results", {})
+            level_results[scraper_level] = {
+                "headers": headers,
+                "values": values,
+                "hapi_resource_metadata": hapi_resource_metadata,
+            }
+            hapi_metadata["results"] = level_results
             levels_used.add(scraper_level)
-            results
+            results[dataset_id] = hapi_metadata
 
         for name in names:
             if self.scrapers_to_run and not any(
hdx/scraper/utilities/reader.py
CHANGED

@@ -340,9 +340,16 @@ class Read(Retrieve):
     def read_hdx_metadata(
         self, datasetinfo: Dict, do_resource_check: bool = True
     ) -> Optional[Resource]:
-        """Read metadata from HDX dataset and add to input dictionary. If url
-        supplied, will look through resources for one that matches
-        use its url unless do_resource_check is False.
+        """Read metadata from HDX dataset and add to input dictionary. If url
+        is not supplied, will look through resources for one that matches
+        specified format and use its url unless do_resource_check is False.
+        The dataset key of the parameter datasetinfo will usually point to a
+        string (single dataset) but where sources vary across HXL tags can be
+        a dictionary that maps from HXL tags to datasets with the key
+        default_dataset setting a default for HXL tags. For a single dataset,
+        the keys hapi_dataset_metadata and hapi_resource_metadata will be
+        populated with more detailed dataset and resource information required
+        by HAPI.
 
         Args:
             datasetinfo (Dict): Dictionary of information about dataset

@@ -354,7 +361,9 @@
         dataset_nameinfo = datasetinfo["dataset"]
         if isinstance(dataset_nameinfo, str):
             dataset = self.read_dataset(dataset_nameinfo)
-
+            datasetinfo[
+                "hapi_dataset_metadata"
+            ] = self.get_hapi_dataset_metadata(dataset)
             resource = None
             url = datasetinfo.get("url")
             if do_resource_check and not url:

@@ -365,8 +374,8 @@
                     if resource_name and resource["name"] != resource_name:
                         continue
                     url = resource["url"]
-
-                    "
+                    datasetinfo[
+                        "hapi_resource_metadata"
                     ] = self.get_hapi_resource_metadata(resource)
                     break
             if not url:

@@ -374,7 +383,6 @@
                     f"Cannot find {format} resource in {dataset_nameinfo}!"
                 )
             datasetinfo["url"] = url
-        datasetinfo["hapi_metadata"] = hapi_metadata
         if "source_date" not in datasetinfo:
             datasetinfo[
                 "source_date"
{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: hdx-python-scraper
-Version: 2.1.9
+Version: 2.2.1
 Summary: HDX Python scraper utilities to assemble data from multiple sources
 Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
 Author-email: Michael Rans <rans@email.com>

@@ -26,12 +26,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Requires-Dist: gspread
-Requires-Dist: hdx-python-api>=6.1.
+Requires-Dist: hdx-python-api>=6.1.3
 Requires-Dist: regex
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == 'dev'
 Provides-Extra: pandas
-Requires-Dist: pandas>=2.
+Requires-Dist: pandas>=2.1.1; extra == 'pandas'
 Provides-Extra: test
 Requires-Dist: pytest; extra == 'test'
 Requires-Dist: pytest-cov; extra == 'test'
{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/RECORD
CHANGED

@@ -1,12 +1,12 @@
 hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
-hdx/scraper/_version.py,sha256=
-hdx/scraper/base_scraper.py,sha256=
-hdx/scraper/runner.py,sha256
+hdx/scraper/_version.py,sha256=R_Wr7clGXr8a07n6uqFj88MyYFGydFRXYBI10R9k_uw,411
+hdx/scraper/base_scraper.py,sha256=IaUDqnrSxB0kbEQynX-81NEyv9DLxypWKwEDAEr9GWg,14628
+hdx/scraper/runner.py,sha256=-7L-L9WGZdTGl5mWNAPgvpTreU9bvbdxklruGCRzjRs,51217
 hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
 hdx/scraper/configurable/resource_downloader.py,sha256=vK8zNFy7T_Rj1h8Tj676-3B2oYYXFNKsrM9dxz7RZC8,1537
 hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
-hdx/scraper/configurable/scraper.py,sha256=
+hdx/scraper/configurable/scraper.py,sha256=TyB7ipTzhVpOC3in0ZBIMwbcTAOR0Ul-W6Np85NnogI,20468
 hdx/scraper/configurable/timeseries.py,sha256=uhnENo7Wsy0-YVjglm7OQkXI72-te61DkepkihbQrP8,2982
 hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566

@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
 hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
 hdx/scraper/utilities/__init__.py,sha256=iBjD7bc8wEzQhwkcx2mOZwYmu28VHjl5px66quqWJ8E,2491
 hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
-hdx/scraper/utilities/reader.py,sha256=
+hdx/scraper/utilities/reader.py,sha256=awm24AUWlweJmJVE1h0iid7xb6njvF7Taf0afbGXIG4,18331
 hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
 hdx/scraper/utilities/sources.py,sha256=h27PjBADqIhqDwmhzMXt1OjwJWZc2iVnIBwJuAJKHwo,11204
 hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
-hdx_python_scraper-2.1.
-hdx_python_scraper-2.1.
-hdx_python_scraper-2.1.
-hdx_python_scraper-2.1.
+hdx_python_scraper-2.2.1.dist-info/METADATA,sha256=hnYCmTG7ZlGqfc4QKCHjBKSesZ2q7ooTbdtyAuuhkqs,3289
+hdx_python_scraper-2.2.1.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
+hdx_python_scraper-2.2.1.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
+hdx_python_scraper-2.2.1.dist-info/RECORD,,

{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/WHEEL
File without changes

{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/licenses/LICENSE
File without changes