hdx-python-scraper 2.1.9__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
hdx/scraper/_version.py CHANGED
@@ -1,4 +1,16 @@
  # file generated by setuptools_scm
  # don't change, don't track in version control
- __version__ = version = '2.1.9'
- __version_tuple__ = version_tuple = (2, 1, 9)
+ TYPE_CHECKING = False
+ if TYPE_CHECKING:
+     from typing import Tuple, Union
+     VERSION_TUPLE = Tuple[Union[int, str], ...]
+ else:
+     VERSION_TUPLE = object
+
+ version: str
+ __version__: str
+ __version_tuple__: VERSION_TUPLE
+ version_tuple: VERSION_TUPLE
+
+ __version__ = version = '2.2.1'
+ __version_tuple__ = version_tuple = (2, 2, 1)
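For consumers, the new setuptools_scm template is behaviorally identical to the old two-line file; the version is read the same way as before. A minimal sketch (assuming the wheel is installed):

    # Reading the generated version at runtime; the module path and
    # attribute names come from the generated _version.py shown above.
    from hdx.scraper._version import __version__, version_tuple

    print(__version__)    # '2.2.1'
    print(version_tuple)  # (2, 2, 1)

The VERSION_TUPLE annotation is Tuple[Union[int, str], ...] rather than Tuple[int, ...] because version tuples may mix ints with strings (e.g. dev or post segments).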
hdx/scraper/base_scraper.py CHANGED
@@ -346,14 +346,23 @@ class BaseScraper(ABC):
          """
          return self.source_urls

-     def get_hapi_metadata(self) -> Optional[Dict]:
+     def get_hapi_dataset_metadata(self) -> Optional[Dict]:
          """
-         Get HAPI metadata
+         Get HAPI dataset metadata

          Returns:
-             Optional[Dict]: HAPI metadata
+             Optional[Dict]: HAPI dataset metadata
          """
-         return self.datasetinfo.get("hapi_metadata")
+         return self.datasetinfo.get("hapi_dataset_metadata")
+
+     def get_hapi_resource_metadata(self) -> Optional[Dict]:
+         """
+         Get HAPI resource metadata
+
+         Returns:
+             Optional[Dict]: HAPI resource metadata
+         """
+         return self.datasetinfo.get("hapi_resource_metadata")

      def add_population(self) -> None:
          """
hdx/scraper/configurable/scraper.py CHANGED
@@ -122,6 +122,7 @@ class ConfigurableScraper(BaseScraper):
              "input": datasetinfo.get("input", []),
              "transform": datasetinfo.get("transform", {}),
              "population_key": datasetinfo.get("population_key"),
+             "list": datasetinfo.get("list", []),
              "process": datasetinfo.get("process", []),
              "input_keep": datasetinfo.get("input_keep", []),
              "input_append": datasetinfo.get("input_append", []),
@@ -292,6 +293,7 @@ class ConfigurableScraper(BaseScraper):
          filter = subset["filter"]
          input_ignore_vals = subset.get("input_ignore_vals", [])
          input_transforms = subset.get("transform", {})
+         list_cols = subset.get("list")
          sum_cols = subset.get("sum")
          process_cols = subset.get("process")
          input_append = subset.get("input_append", [])
@@ -304,6 +306,8 @@ class ConfigurableScraper(BaseScraper):
                  val = eval(input_transform.replace(valcol, "val"))
              if sum_cols or process_cols:
                  dict_of_lists_add(valuedict, adm, val)
+             elif list_cols and valcol in list_cols:
+                 dict_of_lists_add(valuedict, adm, val)
              else:
                  curval = valuedict.get(adm)
                  if valcol in input_append:
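Together these changes add a list option to configurable scrapers: columns named under the list key accumulate every value seen for an admin unit (via dict_of_lists_add), where previously only sum or process columns did so and other columns kept a single value. A hypothetical datasetinfo fragment (the dataset name and column are invented for illustration; only the list key is new in this release):

    datasetinfo = {
        "dataset": "example-dataset",   # hypothetical HDX dataset name
        "format": "csv",
        "input": ["#affected+killed"],  # value column(s) to read
        "list": ["#affected+killed"],   # accumulate all values per admin unit
    }
    # Without "list" (and without "sum"/"process"), a later row for the same
    # admin unit overwrites the earlier value; with "list", both are kept.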
@@ -326,6 +330,7 @@ class ConfigurableScraper(BaseScraper):
              population_str = "self.population_lookup[adm]"
          else:
              population_str = "self.population_lookup[population_key]"
+         subset.get("list")
          process_cols = subset.get("process")
          input_keep = subset.get("input_keep", [])
          sum_cols = subset.get("sum")
@@ -440,7 +445,7 @@ class ConfigurableScraper(BaseScraper):
                      valcols[i], f"newvaldicts[{i}][adm]"
                  )
              formula = formula.replace("#pzbgvjh", population_str)
-             for adm in valdicts[0].keys():
+             for adm in valdicts[0]:
                  try:
                      val = eval(formula)
                  except (ValueError, TypeError, KeyError):
hdx/scraper/runner.py CHANGED
@@ -1146,51 +1146,67 @@ class Runner:

      def get_hapi_metadata(
          self, names: Optional[ListTuple[str]] = None
-     ) -> List[Dict]:
-         """Get HAPI metadata for all datasets
+     ) -> Dict:
+         """Get HAPI metadata for all datasets. A dictionary is returned that
+         maps from dataset ids to a dictionary. The dictionary has keys for
+         dataset metadata and a key resources under which is a dictionary that
+         maps from resource ids to resource metadata.

          Args:
              names (Optional[ListTuple[str]]): Names of scrapers

          Returns:
-             List[Dict]: HAPI metadata for all datasets
+             Dict: HAPI metadata for all datasets
          """
          if not names:
              names = self.scrapers.keys()
-         hapi_metadata_list = []
+         results = {}
          for name in names:
              scraper = self.get_scraper(name)
              if not scraper.has_run:
                  continue
-             hapi_metadata = scraper.get_hapi_metadata()
-             if hapi_metadata:
-                 hapi_metadata_list.append(hapi_metadata)
-         return hapi_metadata_list
+             hapi_dataset_metadata = scraper.get_hapi_dataset_metadata()
+             hapi_resource_metadata = scraper.get_hapi_resource_metadata()
+             dataset_id = hapi_dataset_metadata["hdx_id"]
+             resource_id = hapi_resource_metadata["hdx_id"]
+             hapi_metadata = results.get(
+                 dataset_id, copy(hapi_dataset_metadata)
+             )
+             hapi_resources = hapi_metadata.get("resources", {})
+             hapi_resources[resource_id] = hapi_resource_metadata
+             hapi_metadata["resources"] = hapi_resources
+             results[dataset_id] = hapi_metadata
+         return results

      def get_hapi_results(
          self,
          names: Optional[ListTuple[str]] = None,
          has_run: bool = True,
-     ) -> List[Dict]:
-         """Get the results (headers, values and HAPi metadata) for scrapers
-         limiting to those in names if given and limiting further to those that
-         have been set in the constructor if previously given. By default only
-         scrapers marked as having run are returned unless has_run is set to
-         False. A list of dictionaries is returned where each dictionary has
-         keys headers, values, HAPI metadata and fallbacks. Headers is
-         a tuple of (column headers, hxl hashtags). Values, sources and
-         fallbacks are all lists.
+     ) -> Dict:
+         """Get the results (headers and values per admin level and HAPI
+         metadata) for scrapers limiting to those in names if given and limiting
+         further to those that have been set in the constructor if previously
+         given. By default, only scrapers marked as having run are returned
+         unless has_run is set to False.
+
+         A dictionary is returned where key is HDX dataset id and value is a
+         dictionary that has HAPI dataset metadata as well as a results key.
+         The value associated with the results key is a dictionary where each
+         key is an admin level. Each admin level key has a value dictionary with
+         headers, values and HAPI resource metadata. Headers is a tuple of
+         (column headers, hxl hashtags). Values is a list. HAPI resource
+         metadata is a dictionary.

          Args:
              names (Optional[ListTuple[str]]): Names of scrapers. Defaults to None (all scrapers).
              has_run (bool): Only get results for scrapers marked as having run. Defaults to True.

          Returns:
-             List[Dict]: Headers, values and HAPI metadata for all datasets
+             Dict: Headers and values per admin level and HAPI metadata for all datasets
          """
          if not names:
              names = self.scrapers.keys()
-         results = []
+         results = {}

          def add_results(scraper_level, scrap, levels_used):
              nonlocal results
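The reshaped return value of get_hapi_metadata can be sketched as follows (reconstructed from the docstring and code above; the ids and fields are placeholders):

    hapi_metadata = {
        "<dataset hdx_id>": {
            # ...dataset metadata from get_hapi_dataset_metadata()...
        "resources": {
                "<resource hdx_id>": {
                    # ...resource metadata from get_hapi_resource_metadata()...
                },
            },
        },
    }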
@@ -1201,11 +1217,21 @@ class Runner:
              if headers is None:
                  return
              values = scrap.get_values(scraper_level)
-             hapi_metadata = copy(scrap.get_hapi_metadata())
-             hapi_metadata["headers"] = headers
-             hapi_metadata["values"] = values
+             hapi_dataset_metadata = scrap.get_hapi_dataset_metadata()
+             hapi_resource_metadata = scrap.get_hapi_resource_metadata()
+             dataset_id = hapi_dataset_metadata["hdx_id"]
+             hapi_metadata = results.get(
+                 dataset_id, copy(hapi_dataset_metadata)
+             )
+             level_results = hapi_metadata.get("results", {})
+             level_results[scraper_level] = {
+                 "headers": headers,
+                 "values": values,
+                 "hapi_resource_metadata": hapi_resource_metadata,
+             }
+             hapi_metadata["results"] = level_results
              levels_used.add(scraper_level)
-             results.append(hapi_metadata)
+             results[dataset_id] = hapi_metadata

          for name in names:
              if self.scrapers_to_run and not any(
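Correspondingly, get_hapi_results now returns a dictionary keyed by HDX dataset id rather than a list. A sketch of its shape per the docstring (headers and values shown are placeholders):

    hapi_results = {
        "<dataset hdx_id>": {
            # ...dataset metadata fields...
            "results": {
                "<admin level>": {
                    "headers": (["Header"], ["#hxl+tag"]),  # (column headers, hxl hashtags)
                    "values": [...],                        # list of values
                    "hapi_resource_metadata": {...},        # resource metadata dict
                },
            },
        },
    }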
hdx/scraper/utilities/reader.py CHANGED
@@ -340,9 +340,16 @@ class Read(Retrieve):
      def read_hdx_metadata(
          self, datasetinfo: Dict, do_resource_check: bool = True
      ) -> Optional[Resource]:
-         """Read metadata from HDX dataset and add to input dictionary. If url is not
-         supplied, will look through resources for one that matches specified format and
-         use its url unless do_resource_check is False.
+         """Read metadata from HDX dataset and add to input dictionary. If url
+         is not supplied, will look through resources for one that matches
+         specified format and use its url unless do_resource_check is False.
+         The dataset key of the parameter datasetinfo will usually point to a
+         string (single dataset) but where sources vary across HXL tags can be
+         a dictionary that maps from HXL tags to datasets with the key
+         default_dataset setting a default for HXL tags. For a single dataset,
+         the keys hapi_dataset_metadata and hapi_resource_metadata will be
+         populated with more detailed dataset and resource information required
+         by HAPI.

          Args:
              datasetinfo (Dict): Dictionary of information about dataset
@@ -354,7 +361,9 @@ class Read(Retrieve):
          dataset_nameinfo = datasetinfo["dataset"]
          if isinstance(dataset_nameinfo, str):
              dataset = self.read_dataset(dataset_nameinfo)
-             hapi_metadata = self.get_hapi_dataset_metadata(dataset)
+             datasetinfo[
+                 "hapi_dataset_metadata"
+             ] = self.get_hapi_dataset_metadata(dataset)
              resource = None
              url = datasetinfo.get("url")
              if do_resource_check and not url:
@@ -365,8 +374,8 @@ class Read(Retrieve):
                      if resource_name and resource["name"] != resource_name:
                          continue
                      url = resource["url"]
-                     hapi_metadata[
-                         "resource"
+                     datasetinfo[
+                         "hapi_resource_metadata"
                      ] = self.get_hapi_resource_metadata(resource)
                      break
              if not url:
@@ -374,7 +383,6 @@ class Read(Retrieve):
                      f"Cannot find {format} resource in {dataset_nameinfo}!"
                  )
              datasetinfo["url"] = url
-             datasetinfo["hapi_metadata"] = hapi_metadata
          if "source_date" not in datasetinfo:
              datasetinfo[
                  "source_date"
hdx_python_scraper-2.1.9.dist-info/METADATA → hdx_python_scraper-2.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: hdx-python-scraper
- Version: 2.1.9
+ Version: 2.2.1
  Summary: HDX Python scraper utilities to assemble data from multiple sources
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
  Author-email: Michael Rans <rans@email.com>
@@ -26,12 +26,12 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.8
  Requires-Dist: gspread
- Requires-Dist: hdx-python-api>=6.1.1
+ Requires-Dist: hdx-python-api>=6.1.3
  Requires-Dist: regex
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == 'dev'
  Provides-Extra: pandas
- Requires-Dist: pandas>=2.0.3; extra == 'pandas'
+ Requires-Dist: pandas>=2.1.1; extra == 'pandas'
  Provides-Extra: test
  Requires-Dist: pytest; extra == 'test'
  Requires-Dist: pytest-cov; extra == 'test'
hdx_python_scraper-2.1.9.dist-info/RECORD → hdx_python_scraper-2.2.1.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
  hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
- hdx/scraper/_version.py,sha256=MpnaybiTOEFk89HrKfy8TfZn_07O2pxn6coEIcViaoI,160
- hdx/scraper/base_scraper.py,sha256=o_r8xl8piArJrNan3RsN-BV9HDqchcDQUQeYKwcw4vg,14345
- hdx/scraper/runner.py,sha256=MZt8Omk-lUpUOzKmXnra77Ljze-xwZ-crFltYdFAIts,49796
+ hdx/scraper/_version.py,sha256=R_Wr7clGXr8a07n6uqFj88MyYFGydFRXYBI10R9k_uw,411
+ hdx/scraper/base_scraper.py,sha256=IaUDqnrSxB0kbEQynX-81NEyv9DLxypWKwEDAEr9GWg,14628
+ hdx/scraper/runner.py,sha256=-7L-L9WGZdTGl5mWNAPgvpTreU9bvbdxklruGCRzjRs,51217
  hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
  hdx/scraper/configurable/resource_downloader.py,sha256=vK8zNFy7T_Rj1h8Tj676-3B2oYYXFNKsrM9dxz7RZC8,1537
  hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
- hdx/scraper/configurable/scraper.py,sha256=kVQpVMHEYizSq94PeSP119gi24a1XR_mzL4mtlAvK2M,20217
+ hdx/scraper/configurable/scraper.py,sha256=TyB7ipTzhVpOC3in0ZBIMwbcTAOR0Ul-W6Np85NnogI,20468
  hdx/scraper/configurable/timeseries.py,sha256=uhnENo7Wsy0-YVjglm7OQkXI72-te61DkepkihbQrP8,2982
  hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
  hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
  hdx/scraper/utilities/__init__.py,sha256=iBjD7bc8wEzQhwkcx2mOZwYmu28VHjl5px66quqWJ8E,2491
  hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
- hdx/scraper/utilities/reader.py,sha256=qYL5jTkhBOmZBe5AwA_7B2KTefSwlVkvGhvXAaOlaJA,17850
+ hdx/scraper/utilities/reader.py,sha256=awm24AUWlweJmJVE1h0iid7xb6njvF7Taf0afbGXIG4,18331
  hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
  hdx/scraper/utilities/sources.py,sha256=h27PjBADqIhqDwmhzMXt1OjwJWZc2iVnIBwJuAJKHwo,11204
  hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
- hdx_python_scraper-2.1.9.dist-info/METADATA,sha256=de89qL7_O7_63htWg_72uKBe4cJ9oz_Zsf8PZEmGSVc,3289
- hdx_python_scraper-2.1.9.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
- hdx_python_scraper-2.1.9.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
- hdx_python_scraper-2.1.9.dist-info/RECORD,,
+ hdx_python_scraper-2.2.1.dist-info/METADATA,sha256=hnYCmTG7ZlGqfc4QKCHjBKSesZ2q7ooTbdtyAuuhkqs,3289
+ hdx_python_scraper-2.2.1.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
+ hdx_python_scraper-2.2.1.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
+ hdx_python_scraper-2.2.1.dist-info/RECORD,,