hdx-python-scraper 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdx/scraper/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2.3.0'
16
- __version_tuple__ = version_tuple = (2, 3, 0)
15
+ __version__ = version = '2.3.2'
16
+ __version_tuple__ = version_tuple = (2, 3, 2)
@@ -16,6 +16,7 @@ class BaseScraper(ABC):
16
16
  datasetinfo (Dict): Information about dataset
17
17
  headers (Dict[str, Tuple]): Headers to be output at each level_name
18
18
  source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
19
+ reader (str): Reader to use. Defaults to "" (datasetinfo reader falling back on name).
19
20
  """
20
21
 
21
22
  population_lookup = {}
@@ -26,15 +27,20 @@ class BaseScraper(ABC):
26
27
  datasetinfo: Dict,
27
28
  headers: Dict[str, Tuple],
28
29
  source_configuration: Dict = {},
30
+ reader: str = "",
29
31
  ) -> None:
30
- self.setup(name, headers, source_configuration)
32
+ self.name = name
33
+ if reader:
34
+ self.reader = reader
35
+ else:
36
+ self.reader = datasetinfo.get("reader", name)
37
+ self.setup(headers, source_configuration)
31
38
  self.datasetinfo = deepcopy(datasetinfo)
32
39
  self.errors_on_exit = None
33
40
  self.can_fallback = True
34
41
 
35
42
  def setup(
36
43
  self,
37
- name: str,
38
44
  headers: Dict[str, Tuple],
39
45
  source_configuration: Dict = {},
40
46
  ) -> None:
@@ -42,14 +48,12 @@ class BaseScraper(ABC):
42
48
  {"national": (("School Closure",), ("#impact+type",)), ...},
43
49
 
44
50
  Args:
45
- name (str): Name of scraper
46
51
  headers (Dict[str, Tuple]): Headers to be output at each level_name
47
52
  source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
48
53
 
49
54
  Returns:
50
55
  None
51
56
  """
52
- self.name = name
53
57
  self.headers = headers
54
58
  self.initialise_values_sources(source_configuration)
55
59
  self.has_run = False
@@ -92,7 +96,7 @@ class BaseScraper(ABC):
92
96
  None
93
97
  """
94
98
  if not name:
95
- name = self.name
99
+ name = self.reader
96
100
  reader = Read.get_reader(name)
97
101
  return reader
98
102
 
@@ -361,10 +365,16 @@ class BaseScraper(ABC):
361
365
  return None
362
366
  if "is_hxl" in hapi_resource_metadata:
363
367
  return hapi_resource_metadata
364
- reader = self.get_reader(self.name)
368
+ reader = self.get_reader()
365
369
  filename = self.datasetinfo.get("filename")
370
+ file_prefix = self.datasetinfo.get("file_prefix", self.name)
371
+ if filename:
372
+ kwargs = {"filename": filename}
373
+ else:
374
+ kwargs = {"file_prefix": file_prefix}
366
375
  hxl_info = reader.hxl_info_hapi_resource_metadata(
367
- hapi_resource_metadata, filename=filename, file_prefix=self.name
376
+ hapi_resource_metadata,
377
+ **kwargs,
368
378
  )
369
379
  is_hxl = False
370
380
  if hxl_info:
@@ -70,6 +70,8 @@ class ConfigurableScraper(BaseScraper):
70
70
  errors_on_exit: Optional[ErrorsOnExit] = None,
71
71
  **kwargs: Any,
72
72
  ):
73
+ self.name = name
74
+ self.reader = datasetinfo.get("reader", name)
73
75
  self.level = level
74
76
  datelevel = datasetinfo.get("date_level")
75
77
  if datelevel is None:
@@ -98,11 +100,11 @@ class ConfigurableScraper(BaseScraper):
98
100
  use_hxl = self.datasetinfo.get("use_hxl", False)
99
101
  if use_hxl:
100
102
  try:
101
- file_headers, iterator = self.get_iterator(name)
103
+ file_headers, iterator = self.get_iterator()
102
104
  self.use_hxl(headers, file_headers, iterator)
103
105
  except DownloadError:
104
106
  self.can_fallback = False
105
- self.setup(name, headers, source_configuration)
107
+ self.setup(headers, source_configuration)
106
108
 
107
109
  @staticmethod
108
110
  def get_subsets_from_datasetinfo(datasetinfo: Dict) -> List[Dict]:
@@ -136,20 +138,18 @@ class ConfigurableScraper(BaseScraper):
136
138
  ]
137
139
  return subsets
138
140
 
139
- def get_iterator(self, name: str) -> Tuple[List[str], Iterator[Dict]]:
140
- """Get the iterator from the preconfigured reader for the given scraper name
141
-
142
- Args:
143
- name (str): Name of scraper
141
+ def get_iterator(self) -> Tuple[List[str], Iterator[Dict]]:
142
+ """Get the iterator from the preconfigured reader for this scraper
144
143
 
145
144
  Returns:
146
145
  Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
147
146
  """
148
- return self.get_reader(name).read(
149
- self.datasetinfo,
150
- file_prefix=name,
151
- **self.variables,
152
- )
147
+ if (
148
+ "filename" not in self.datasetinfo
149
+ and "file_prefix" not in self.datasetinfo
150
+ ):
151
+ self.datasetinfo["file_prefix"] = self.name
152
+ return self.get_reader().read(self.datasetinfo, **self.variables)
153
153
 
154
154
  def add_sources(self) -> None:
155
155
  """Add source for each HXL hashtag
@@ -466,7 +466,7 @@ class ConfigurableScraper(BaseScraper):
466
466
  Returns:
467
467
  None
468
468
  """
469
- file_headers, iterator = self.get_iterator(self.name)
469
+ file_headers, iterator = self.get_iterator()
470
470
  header_to_hxltag = self.use_hxl(None, file_headers, iterator)
471
471
  if "source_url" not in self.datasetinfo:
472
472
  self.datasetinfo["source_url"] = self.datasetinfo["url"]
@@ -50,7 +50,7 @@ class TimeSeries(BaseScraper):
50
50
  "output_hxl"
51
51
  ]
52
52
  rows = [headers, hxltags]
53
- file_headers, iterator = self.get_reader(self.name).read(
53
+ file_headers, iterator = self.get_reader().read(
54
54
  self.datasetinfo, file_prefix=self.name
55
55
  )
56
56
  for inrow in iterator:
hdx/scraper/runner.py CHANGED
@@ -10,7 +10,7 @@ from .configurable.resource_downloader import ResourceDownloader
10
10
  from .configurable.scraper import ConfigurableScraper
11
11
  from .configurable.timeseries import TimeSeries
12
12
  from .outputs.base import BaseOutput
13
- from .utilities import get_startend_dates_from_reference_period
13
+ from .utilities import get_startend_dates_from_time_period
14
14
  from .utilities.fallbacks import Fallbacks
15
15
  from .utilities.reader import Read
16
16
  from .utilities.sources import Sources
@@ -29,7 +29,7 @@ class Runner:
29
29
  countryiso3s (ListTuple[str]): List of ISO3 country codes to process
30
30
  today (datetime): Value to use for today. Defaults to now_utc().
31
31
  errors_on_exit (ErrorsOnExit): ErrorsOnExit object that logs errors on exit
32
- scrapers_to_run (Optional[ListTuple[str]]): Scrapers to run. Defaults to None.
32
+ scrapers_to_run (Optional[ListTuple[str]]): Scrapers to run. Defaults to None (all scrapers).
33
33
  """
34
34
 
35
35
  def __init__(
@@ -1061,7 +1061,7 @@ class Runner:
1061
1061
  if dataset_name:
1062
1062
  dataset = reader.read_dataset(dataset_name)
1063
1063
  if date is None:
1064
- date = get_startend_dates_from_reference_period(
1064
+ date = get_startend_dates_from_time_period(
1065
1065
  dataset, today=self.today
1066
1066
  )
1067
1067
  if source_name is None:
@@ -1209,32 +1209,52 @@ class Runner:
1209
1209
  """
1210
1210
  if not names:
1211
1211
  names = self.scrapers.keys()
1212
- results = {}
1212
+ hapi_results = {}
1213
1213
 
1214
1214
  def add_results(scraper_level, scrap, levels_used):
1215
- nonlocal results
1215
+ nonlocal hapi_results
1216
1216
 
1217
1217
  if scraper_level in levels_used:
1218
1218
  return
1219
1219
  headers = scrap.headers.get(scraper_level)
1220
1220
  if headers is None:
1221
1221
  return
1222
+ headings = headers[0]
1223
+ hxltags = headers[1]
1222
1224
  values = scrap.get_values(scraper_level)
1223
1225
  hapi_dataset_metadata = scrap.get_hapi_dataset_metadata()
1226
+ if not hapi_dataset_metadata:
1227
+ return
1224
1228
  hapi_resource_metadata = scrap.get_hapi_resource_metadata()
1229
+ if not hapi_resource_metadata:
1230
+ return
1225
1231
  dataset_id = hapi_dataset_metadata["hdx_id"]
1226
- hapi_metadata = results.get(
1232
+ hapi_metadata = hapi_results.get(
1227
1233
  dataset_id, copy(hapi_dataset_metadata)
1228
1234
  )
1229
- level_results = hapi_metadata.get("results", {})
1230
- level_results[scraper_level] = {
1231
- "headers": headers,
1232
- "values": values,
1233
- "hapi_resource_metadata": hapi_resource_metadata,
1234
- }
1235
- hapi_metadata["results"] = level_results
1235
+ results = hapi_metadata.get("results", {})
1236
+ level_results = results.get(scraper_level)
1237
+ if level_results is None:
1238
+ level_results = {
1239
+ "headers": ([], []),
1240
+ "values": [],
1241
+ "hapi_resource_metadata": hapi_resource_metadata,
1242
+ }
1243
+ results[scraper_level] = level_results
1244
+ lev_headings = level_results["headers"][0]
1245
+ lev_hxltags = level_results["headers"][1]
1246
+ lev_values = level_results["values"]
1247
+ for i, hxltag in enumerate(hxltags):
1248
+ if hxltag in lev_hxltags:
1249
+ index = lev_hxltags.index(hxltag)
1250
+ lev_values[index].update(values[i])
1251
+ else:
1252
+ lev_headings.append(headings[i])
1253
+ lev_hxltags.append(hxltag)
1254
+ lev_values.append(values[i])
1255
+ hapi_metadata["results"] = results
1236
1256
  levels_used.add(scraper_level)
1237
- results[dataset_id] = hapi_metadata
1257
+ hapi_results[dataset_id] = hapi_metadata
1238
1258
 
1239
1259
  for name in names:
1240
1260
  if self.scrapers_to_run and not any(
@@ -1247,4 +1267,4 @@ class Runner:
1247
1267
  lvls_used = set()
1248
1268
  for scrap_level in scraper.headers:
1249
1269
  add_results(scrap_level, scraper, lvls_used)
1250
- return results
1270
+ return hapi_results
@@ -57,22 +57,22 @@ def get_rowval(row: Dict, valcol: str) -> Any:
57
57
  return result
58
58
 
59
59
 
60
- def get_startend_dates_from_reference_period(
60
+ def get_startend_dates_from_time_period(
61
61
  dataset: Dataset, today: Optional[datetime] = None
62
62
  ) -> Optional[Dict]:
63
- """Return the reference period in form required for source_date
63
+ """Return the time period in form required for source_date
64
64
 
65
65
  Args:
66
66
  dataset (Dataset): Dataset object
67
67
  today (Optional[datetime]): Date to use for today. Defaults to None (datetime.utcnow)
68
68
 
69
69
  Returns:
70
- Optional[Dict]: Reference period in form required for source_date
70
+ Optional[Dict]: Time period in form required for source_date
71
71
  """
72
72
  if today is None:
73
- date_info = dataset.get_reference_period()
73
+ date_info = dataset.get_time_period()
74
74
  else:
75
- date_info = dataset.get_reference_period(today=today)
75
+ date_info = dataset.get_time_period(today=today)
76
76
  startdate = date_info.get("startdate")
77
77
  enddate = date_info.get("enddate")
78
78
  if enddate is None:
@@ -8,7 +8,7 @@ import hxl
8
8
  from hxl.input import InputOptions, munge_url
9
9
  from slugify import slugify
10
10
 
11
- from . import get_startend_dates_from_reference_period, match_template
11
+ from . import get_startend_dates_from_time_period, match_template
12
12
  from .sources import Sources
13
13
  from hdx.data.dataset import Dataset
14
14
  from hdx.data.resource import Resource
@@ -224,6 +224,13 @@ class Read(Retrieve):
224
224
  filename = datasetinfo.get("filename")
225
225
  if filename:
226
226
  kwargs["filename"] = filename
227
+ if filename:
228
+ # remove file_prefix if filename provided
229
+ kwargs.pop("file_prefix", None)
230
+ elif "file_prefix" not in kwargs:
231
+ file_prefix = datasetinfo.get("file_prefix")
232
+ if file_prefix:
233
+ kwargs["file_prefix"] = file_prefix
227
234
  return self.get_tabular_rows(
228
235
  url,
229
236
  dict_form=True,
@@ -311,7 +318,7 @@ class Read(Retrieve):
311
318
  """
312
319
  return self.construct_filename_and_download(
313
320
  resource["name"],
314
- resource.get_file_type(),
321
+ resource.get_format(),
315
322
  resource["url"],
316
323
  **kwargs,
317
324
  )
@@ -331,7 +338,7 @@ class Read(Retrieve):
331
338
  "title": dataset["title"],
332
339
  "hdx_provider_stub": dataset["organization"]["name"],
333
340
  "hdx_provider_name": dataset["organization"]["title"],
334
- "reference_period": dataset.get_reference_period(today=self.today),
341
+ "reference_period": dataset.get_time_period(today=self.today),
335
342
  }
336
343
 
337
344
  @staticmethod
@@ -477,7 +484,7 @@ class Read(Retrieve):
477
484
  if "source_date" not in datasetinfo:
478
485
  datasetinfo[
479
486
  "source_date"
480
- ] = get_startend_dates_from_reference_period(
487
+ ] = get_startend_dates_from_time_period(
481
488
  dataset, today=self.today
482
489
  )
483
490
  if "source" not in datasetinfo:
@@ -510,7 +517,7 @@ class Read(Retrieve):
510
517
  key = "default_date"
511
518
  else:
512
519
  key = hxltag
513
- source_date[key] = get_startend_dates_from_reference_period(
520
+ source_date[key] = get_startend_dates_from_time_period(
514
521
  dataset, today=self.today
515
522
  )
516
523
  if source is not None:
@@ -555,10 +562,14 @@ class Read(Retrieve):
555
562
  datasetinfo["filename"] = filename
556
563
  filename = datasetinfo.get("filename")
557
564
  if resource and not filename:
558
- # prefix is added later
559
565
  filename = self.construct_filename(
560
- resource["name"], resource.get_file_type()
566
+ resource["name"], resource.get_format()
561
567
  )
568
+ file_prefix = kwargs.get("file_prefix")
569
+ if not file_prefix:
570
+ file_prefix = datasetinfo.get("file_prefix")
571
+ if file_prefix:
572
+ filename = f"{file_prefix}_{filename}"
562
573
  datasetinfo["filename"] = filename
563
574
  return self.read_tabular(datasetinfo, **kwargs)
564
575
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hdx-python-scraper
3
- Version: 2.3.0
3
+ Version: 2.3.2
4
4
  Summary: HDX Python scraper utilities to assemble data from multiple sources
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -26,7 +26,7 @@ Classifier: Programming Language :: Python :: 3.12
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Requires-Python: >=3.8
28
28
  Requires-Dist: gspread
29
- Requires-Dist: hdx-python-api>=6.1.4
29
+ Requires-Dist: hdx-python-api>=6.2.0
30
30
  Requires-Dist: hdx-python-country>=3.6.3
31
31
  Requires-Dist: regex
32
32
  Provides-Extra: dev
@@ -1,25 +1,25 @@
1
1
  hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
- hdx/scraper/_version.py,sha256=ChsIHG8bRc-eXUbXOgv4Fm4DstSKLq9FpsTAsaMeR08,411
3
- hdx/scraper/base_scraper.py,sha256=OZoC8X3woecKbMxTtjx_aRr027SeJCS2gbtyB20n31o,15079
4
- hdx/scraper/runner.py,sha256=fojFcfEh3mZXe1dY3Jpis22dr9Zc6VY-0XTMiabuXFE,51366
2
+ hdx/scraper/_version.py,sha256=aKqtdIqWETcZnGj_9koZ-EQK7itBfKLMIKY7ucdTIMI,411
3
+ hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
4
+ hdx/scraper/runner.py,sha256=KIEVLSJwEw9fzQxqsN92c50yDG3CRYAVDO7A6Zv_KJY,52262
5
5
  hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
7
7
  hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
8
8
  hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
9
- hdx/scraper/configurable/scraper.py,sha256=kBkS-bm4zIQ9jbzFcwVoAnyji_9PTV_KKrNJVLTuYa4,20498
10
- hdx/scraper/configurable/timeseries.py,sha256=lWoQJApml-onTN4l9YnTAYnhj5uuTc-Luk05DIT7O9k,3036
9
+ hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
10
+ hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
11
11
  hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
13
13
  hdx/scraper/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
14
14
  hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1Ep39QY,3087
15
15
  hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
16
- hdx/scraper/utilities/__init__.py,sha256=iBjD7bc8wEzQhwkcx2mOZwYmu28VHjl5px66quqWJ8E,2491
16
+ hdx/scraper/utilities/__init__.py,sha256=1IaNOMhAxjGRDKUHSM_ENFcPRn0vw499K9iTX4LvCS0,2466
17
17
  hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
18
- hdx/scraper/utilities/reader.py,sha256=9cXrk8_NrE4kHIm3wrM3KHgKX6bho_eCyibMDBairiU,21499
18
+ hdx/scraper/utilities/reader.py,sha256=HaR0da1my59P1T4sYe15GwX5cf5m4UbMo1r2uR9yvP8,21963
19
19
  hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
20
20
  hdx/scraper/utilities/sources.py,sha256=h27PjBADqIhqDwmhzMXt1OjwJWZc2iVnIBwJuAJKHwo,11204
21
21
  hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
22
- hdx_python_scraper-2.3.0.dist-info/METADATA,sha256=E5b13txhk44RjnOSKJu_SkaypNFXxe5YDLUBCWKA7Pk,3318
23
- hdx_python_scraper-2.3.0.dist-info/WHEEL,sha256=0wCxn4rnLsvRWBK-NC7mK2QMIQ_aZSl7Qvk-8IWl_pY,87
24
- hdx_python_scraper-2.3.0.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
- hdx_python_scraper-2.3.0.dist-info/RECORD,,
22
+ hdx_python_scraper-2.3.2.dist-info/METADATA,sha256=cDyJQpQAf7U486xjcUYFbyoVm2fPLZPu8mLMG7GWVMU,3318
23
+ hdx_python_scraper-2.3.2.dist-info/WHEEL,sha256=mRYSEL3Ih6g5a_CVMIcwiF__0Ae4_gLYh01YFNwiq1k,87
24
+ hdx_python_scraper-2.3.2.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
+ hdx_python_scraper-2.3.2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.19.1
2
+ Generator: hatchling 1.21.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any