hdx-python-scraper 2.3.5__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. hdx/scraper/{__init__.py → framework/__init__.py} +0 -0
  2. hdx/scraper/{_version.py → framework/_version.py} +2 -2
  3. hdx/scraper/{base_scraper.py → framework/base_scraper.py} +4 -4
  4. hdx/scraper/{outputs → framework/outputs}/googlesheets.py +1 -1
  5. hdx/scraper/{runner.py → framework/runner.py} +12 -12
  6. hdx/scraper/{configurable/scraper.py → framework/scrapers/configurable_scraper.py} +5 -5
  7. hdx/scraper/{configurable → framework/scrapers}/rowparser.py +58 -23
  8. hdx/scraper/{utilities → framework/utilities}/reader.py +93 -22
  9. hdx/scraper/framework/utilities/sector.py +63 -0
  10. hdx/scraper/framework/utilities/sector_configuration.yaml +138 -0
  11. hdx/scraper/{utilities → framework/utilities}/sources.py +3 -3
  12. {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.2.dist-info}/METADATA +6 -5
  13. hdx_python_scraper-2.5.2.dist-info/RECORD +27 -0
  14. {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.2.dist-info}/WHEEL +1 -1
  15. hdx_python_scraper-2.3.5.dist-info/RECORD +0 -25
  16. /hdx/scraper/{configurable → framework/outputs}/__init__.py +0 -0
  17. /hdx/scraper/{outputs → framework/outputs}/base.py +0 -0
  18. /hdx/scraper/{outputs → framework/outputs}/excelfile.py +0 -0
  19. /hdx/scraper/{outputs → framework/outputs}/json.py +0 -0
  20. /hdx/scraper/{outputs → framework/scrapers}/__init__.py +0 -0
  21. /hdx/scraper/{configurable → framework/scrapers}/aggregator.py +0 -0
  22. /hdx/scraper/{configurable → framework/scrapers}/resource_downloader.py +0 -0
  23. /hdx/scraper/{configurable → framework/scrapers}/timeseries.py +0 -0
  24. /hdx/scraper/{utilities → framework/utilities}/__init__.py +0 -0
  25. /hdx/scraper/{utilities → framework/utilities}/fallbacks.py +0 -0
  26. /hdx/scraper/{utilities → framework/utilities}/region_lookup.py +0 -0
  27. /hdx/scraper/{utilities → framework/utilities}/writer.py +0 -0
  28. {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.2.dist-info}/licenses/LICENSE +0 -0
File without changes
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2.3.5'
16
- __version_tuple__ = version_tuple = (2, 3, 5)
15
+ __version__ = version = '2.5.2'
16
+ __version_tuple__ = version_tuple = (2, 5, 2)
@@ -36,7 +36,7 @@ class BaseScraper(ABC):
36
36
  self.reader = datasetinfo.get("reader", name)
37
37
  self.setup(headers, source_configuration)
38
38
  self.datasetinfo = deepcopy(datasetinfo)
39
- self.errors_on_exit = None
39
+ self.error_handler = None
40
40
  self.can_fallback = True
41
41
 
42
42
  def setup(
@@ -141,9 +141,9 @@ class BaseScraper(ABC):
141
141
  "should_overwrite_sources"
142
142
  )
143
143
  if should_overwrite_sources is not None:
144
- self.source_configuration[
145
- "should_overwrite_sources"
146
- ] = should_overwrite_sources
144
+ self.source_configuration["should_overwrite_sources"] = (
145
+ should_overwrite_sources
146
+ )
147
147
  source = self.datasetinfo["source"]
148
148
  if isinstance(source, str):
149
149
  source = {"default_source": source}
@@ -91,4 +91,4 @@ class GoogleSheets(BaseOutput):
91
91
  df.fillna("NaN", inplace=True)
92
92
  rows.extend(df.values.tolist())
93
93
  values = rows
94
- tab.update("A1", values)
94
+ tab.update(values, "A1")
@@ -5,18 +5,18 @@ from traceback import format_exc
5
5
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from .base_scraper import BaseScraper
8
- from .configurable.aggregator import Aggregator
9
- from .configurable.resource_downloader import ResourceDownloader
10
- from .configurable.scraper import ConfigurableScraper
11
- from .configurable.timeseries import TimeSeries
12
8
  from .outputs.base import BaseOutput
9
+ from .scrapers.aggregator import Aggregator
10
+ from .scrapers.configurable_scraper import ConfigurableScraper
11
+ from .scrapers.resource_downloader import ResourceDownloader
12
+ from .scrapers.timeseries import TimeSeries
13
13
  from .utilities import get_startend_dates_from_time_period
14
14
  from .utilities.fallbacks import Fallbacks
15
15
  from .utilities.reader import Read
16
16
  from .utilities.sources import Sources
17
17
  from hdx.location.adminlevel import AdminLevel
18
18
  from hdx.utilities.dateparse import now_utc
19
- from hdx.utilities.errors_onexit import ErrorsOnExit
19
+ from hdx.utilities.error_handler import ErrorHandler
20
20
  from hdx.utilities.typehint import ListTuple
21
21
 
22
22
  logger = logging.getLogger(__name__)
@@ -28,7 +28,7 @@ class Runner:
28
28
  Args:
29
29
  countryiso3s (ListTuple[str]): List of ISO3 country codes to process
30
30
  today (datetime): Value to use for today. Defaults to now_utc().
31
- errors_on_exit (ErrorsOnExit): ErrorsOnExit object that logs errors on exit
31
+ error_handler (ErrorHandler): ErrorHandler object that logs errors on exit
32
32
  scrapers_to_run (Optional[ListTuple[str]]): Scrapers to run. Defaults to None (all scrapers).
33
33
  """
34
34
 
@@ -36,12 +36,12 @@ class Runner:
36
36
  self,
37
37
  countryiso3s: ListTuple[str],
38
38
  today: datetime = now_utc(),
39
- errors_on_exit: Optional[ErrorsOnExit] = None,
39
+ error_handler: Optional[ErrorHandler] = None,
40
40
  scrapers_to_run: Optional[ListTuple[str]] = None,
41
41
  ):
42
42
  self.countryiso3s = countryiso3s
43
43
  self.today = today
44
- self.errors_on_exit = errors_on_exit
44
+ self.error_handler = error_handler
45
45
  if isinstance(scrapers_to_run, tuple):
46
46
  scrapers_to_run = list(scrapers_to_run)
47
47
  self.scrapers_to_run: Optional[List[str]] = scrapers_to_run
@@ -73,7 +73,7 @@ class Runner:
73
73
  and scraper_name not in self.scrapers_to_run
74
74
  ):
75
75
  self.scrapers_to_run.append(scraper_name)
76
- scraper.errors_on_exit = self.errors_on_exit
76
+ scraper.error_handler = self.error_handler
77
77
  return scraper_name
78
78
 
79
79
  def add_customs(
@@ -142,7 +142,7 @@ class Runner:
142
142
  level_name,
143
143
  source_configuration,
144
144
  self.today,
145
- self.errors_on_exit,
145
+ self.error_handler,
146
146
  )
147
147
  if scraper_name not in self.scraper_names:
148
148
  self.scraper_names.append(scraper_name)
@@ -612,8 +612,8 @@ class Runner:
612
612
  if not Fallbacks.exist() or scraper.can_fallback is False:
613
613
  raise
614
614
  logger.exception(f"Using fallbacks for {scraper.name}!")
615
- if self.errors_on_exit:
616
- self.errors_on_exit.add(
615
+ if self.error_handler:
616
+ self.error_handler.add(
617
617
  f"Using fallbacks for {scraper.name}! Error: {format_exc()}"
618
618
  )
619
619
  for level in scraper.headers.keys():
@@ -17,7 +17,7 @@ from hdx.utilities.dateparse import (
17
17
  )
18
18
  from hdx.utilities.dictandlist import dict_of_lists_add
19
19
  from hdx.utilities.downloader import DownloadError
20
- from hdx.utilities.errors_onexit import ErrorsOnExit
20
+ from hdx.utilities.error_handler import ErrorHandler
21
21
  from hdx.utilities.text import ( # noqa: F401
22
22
  get_fraction_str,
23
23
  get_numeric_if_possible,
@@ -42,7 +42,7 @@ class ConfigurableScraper(BaseScraper):
42
42
  level_name (Optional[str]): Customised level_name name. Defaults to None (level).
43
43
  source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
44
44
  today (datetime): Value to use for today. Defaults to now_utc().
45
- errors_on_exit (Optional[ErrorsOnExit]): ErrorsOnExit object that logs errors on exit
45
+ error_handler (Optional[ErrorHandler]): ErrorHandler object that logs errors on exit
46
46
  **kwargs: Variables to use when evaluating template arguments in urls
47
47
  """
48
48
 
@@ -67,7 +67,7 @@ class ConfigurableScraper(BaseScraper):
67
67
  level_name: Optional[str] = None,
68
68
  source_configuration: Dict = {},
69
69
  today: datetime = now_utc(),
70
- errors_on_exit: Optional[ErrorsOnExit] = None,
70
+ error_handler: Optional[ErrorHandler] = None,
71
71
  **kwargs: Any,
72
72
  ):
73
73
  self.name = name
@@ -83,10 +83,10 @@ class ConfigurableScraper(BaseScraper):
83
83
  else:
84
84
  self.level_name: str = level_name
85
85
  self.countryiso3s = countryiso3s
86
- self.adminlevel = adminlevel
86
+ self.adminlevel: Optional[AdminLevel] = adminlevel
87
87
  self.today = today
88
88
  self.subsets = self.get_subsets_from_datasetinfo(datasetinfo)
89
- self.errors_on_exit: Optional[ErrorsOnExit] = errors_on_exit
89
+ self.error_handler: Optional[ErrorHandler] = error_handler
90
90
  self.variables = kwargs
91
91
  self.rowparser = None
92
92
  self.datasetinfo = copy.deepcopy(datasetinfo)
@@ -185,20 +185,14 @@ class RowParser:
185
185
  Returns:
186
186
  Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
187
187
  """
188
- rows = []
189
- for row in iterator:
190
- if self.header_to_hxltag:
191
- newrow = {}
192
- for header in row:
193
- newrow[self.header_to_hxltag[header]] = row[header]
194
- row = newrow
195
- if self.stop_row:
196
- if all(
197
- row[key] == value for key, value in self.stop_row.items()
198
- ):
199
- break
200
- for newrow in self.flatten(row):
201
- rows.append(newrow)
188
+ if self.header_to_hxltag:
189
+ iterator = self.header_to_hxltag_rows(iterator)
190
+ if self.stop_row:
191
+ iterator = self.stop_rows(iterator)
192
+ if self.flatteninfo:
193
+ iterator = self.flatten_rows(iterator)
194
+ if self.prefilter:
195
+ iterator = (row for row in iterator if eval(self.prefilter))
202
196
  if not self.sort:
203
197
  if self.datecol:
204
198
  for subset in self.subsets:
@@ -212,15 +206,59 @@ class RowParser:
212
206
  )
213
207
  self.sort = {"keys": [self.datecol], "reverse": True}
214
208
  break
215
- if self.prefilter:
216
- rows = [row for row in rows if eval(self.prefilter)]
217
209
  if self.sort:
218
210
  keys = self.sort["keys"]
219
211
  reverse = self.sort.get("reverse", False)
220
- rows = sorted(rows, key=itemgetter(*keys), reverse=reverse)
221
- return rows
212
+ iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
213
+ return iterator
214
+
215
+ def header_to_hxltag_rows(
216
+ self, iterator: Iterator[Dict]
217
+ ) -> Generator[Dict, None, None]:
218
+ """Convert headers to HXL tags in keys
219
+
220
+ Args:
221
+ iterator (Iterator[Dict]): Input data
222
+
223
+ Returns:
224
+ Generator[Dict]: Rows where keys are HXL tags
225
+ """
226
+ for row in iterator:
227
+ newrow = {}
228
+ for header in row:
229
+ newrow[self.header_to_hxltag[header]] = row[header]
230
+ yield newrow
231
+
232
+ def stop_rows(
233
+ self, iterator: Iterator[Dict]
234
+ ) -> Generator[Dict, None, None]:
235
+ """Stop processing rows after condition met
236
+
237
+ Args:
238
+ iterator (Iterator[Dict]): Input data
239
+
240
+ Returns:
241
+ Generator[Dict]: Rows up to stop condition
242
+ """
243
+ for row in iterator:
244
+ if all(row[key] == value for key, value in self.stop_row.items()):
245
+ break
246
+ yield row
247
+
248
+ def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
249
+ """Flatten rows
250
+
251
+ Args:
252
+ iterator (Iterator[Dict]): Input data
253
+
254
+ Returns:
255
+ Generator[Dict]: Flattened rows
256
+ """
257
+ for row in iterator:
258
+ for newrow in self.flatten_row(row):
259
+ yield newrow
222
260
 
223
- def flatten(self, row: Dict) -> Generator[Dict, None, None]:
261
+ def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
224
262
  """Flatten a wide spreadsheet format into a long one
225
263
 
226
264
  Args:
@@ -229,9 +267,6 @@ class RowParser:
229
267
  Returns:
230
268
  Generator[Dict]: Flattened row(s)
231
269
  """
232
- if not self.flatteninfo:
233
- yield row
234
- return
235
270
  counters = [-1 for _ in self.flatteninfo]
236
271
  while True:
237
272
  newrow = copy.deepcopy(row)
@@ -314,7 +349,7 @@ class RowParser:
314
349
  adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
315
350
  elif i == 1:
316
351
  adms[i], exact = self.adminlevel.get_pcode(
317
- adms[0], adm, self.name
352
+ adms[0], adm, logname=self.name
318
353
  )
319
354
  if adms[i] not in self.adms[i]:
320
355
  adms[i] = None
@@ -1,3 +1,4 @@
1
+ import glob
1
2
  import logging
2
3
  from datetime import datetime
3
4
  from os.path import join
@@ -10,6 +11,7 @@ from slugify import slugify
10
11
 
11
12
  from . import get_startend_dates_from_time_period, match_template
12
13
  from .sources import Sources
14
+ from hdx.api.configuration import Configuration
13
15
  from hdx.data.dataset import Dataset
14
16
  from hdx.data.resource import Resource
15
17
  from hdx.utilities.dateparse import parse_date
@@ -204,15 +206,19 @@ class Read(Retrieve):
204
206
  if headers is None:
205
207
  headers = 1
206
208
  datasetinfo["headers"] = 1
207
- kwargs["headers"] = headers
208
- if isinstance(headers, list):
209
- kwargs["fill_merged_cells"] = True
210
209
  format = datasetinfo["format"]
211
210
  kwargs["format"] = format
212
- if not sheet and format in ("xls", "xlsx"):
213
- sheet = 1
211
+ if format in ("xls", "xlsx"):
212
+ if not sheet:
213
+ sheet = 1
214
+ if isinstance(headers, list):
215
+ kwargs["fill_merged_cells"] = True
216
+ elif "fill_merged_cells" not in kwargs:
217
+ kwargs["fill_merged_cells"] = False
218
+ kwargs["xlsx2csv"] = datasetinfo.get("xlsx2csv", False)
214
219
  if sheet:
215
220
  kwargs["sheet"] = sheet
221
+ kwargs["headers"] = headers
216
222
  compression = datasetinfo.get("compression")
217
223
  if compression:
218
224
  kwargs["compression"] = compression
@@ -238,11 +244,14 @@ class Read(Retrieve):
238
244
  **kwargs,
239
245
  )
240
246
 
241
- def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
247
+ def read_dataset(
248
+ self, dataset_name: str, configuration: Optional[Configuration] = None
249
+ ) -> Optional[Dataset]:
242
250
  """Read HDX dataset
243
251
 
244
252
  Args:
245
253
  dataset_name (str): Dataset name
254
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
246
255
 
247
256
  Returns:
248
257
  Optional[Dataset]: The dataset that was read or None
@@ -252,7 +261,7 @@ class Read(Retrieve):
252
261
  logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
253
262
  dataset = Dataset.load_from_json(saved_path)
254
263
  else:
255
- dataset = Dataset.read_from_hdx(dataset_name)
264
+ dataset = Dataset.read_from_hdx(dataset_name, configuration)
256
265
  if self.save:
257
266
  logger.info(f"Saving dataset {dataset_name} in {saved_path}")
258
267
  if dataset is None:
@@ -261,6 +270,56 @@ class Read(Retrieve):
261
270
  dataset.save_to_json(saved_path, follow_urls=True)
262
271
  return dataset
263
272
 
273
+ def search_datasets(
274
+ self,
275
+ filename: str,
276
+ query: Optional[str] = "*:*",
277
+ configuration: Optional[Configuration] = None,
278
+ page_size: int = 1000,
279
+ **kwargs: Any,
280
+ ) -> List[Dataset]:
281
+ """Search HDX for datasets matching a query
282
+
283
+ Args:
284
+ filename (str): Base filename for saved files. Each saved file is suffixed with an underscore and a number (eg. filename_0.json).
285
+ query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
286
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
287
+ page_size (int): Size of page to return. Defaults to 1000.
288
+ **kwargs: See below
289
+ fq (string): Any filter queries to apply
290
+ rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
291
+ start (int): Offset in the complete result for where the set of returned datasets should begin
292
+ sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
293
+ facet (string): Whether to enable faceted results. Defaults to True.
294
+ facet.mincount (int): Minimum count a facet field value must have to be included in the results
295
+ facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
296
+ facet.field (List[str]): Fields to facet upon. Default is empty.
297
+ use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.
298
+
299
+ Returns:
300
+ List[Dataset]: list of datasets resulting from query
301
+ """
302
+
303
+ saved_path = join(self.saved_dir, filename)
304
+ if self.use_saved:
305
+ logger.info(
306
+ f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
307
+ )
308
+ datasets = []
309
+ for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
310
+ datasets.append(Dataset.load_from_json(file_path))
311
+ else:
312
+ datasets = Dataset.search_in_hdx(
313
+ query, configuration, page_size, **kwargs
314
+ )
315
+ if self.save:
316
+ for i, dataset in enumerate(datasets):
317
+ file_path = f"{saved_path}_{i}.json"
318
+ name = dataset["name"]
319
+ logger.info(f"Saving dataset {name} in {file_path}")
320
+ dataset.save_to_json(file_path, follow_urls=True)
321
+ return datasets
322
+
264
323
  @staticmethod
265
324
  def construct_filename(name: str, format: str):
266
325
  """Construct filename from name and format. The filename of the file
@@ -438,7 +497,10 @@ class Read(Retrieve):
438
497
  return self.hxl_info_file(name, format, url, **kwargs)
439
498
 
440
499
  def read_hdx_metadata(
441
- self, datasetinfo: Dict, do_resource_check: bool = True
500
+ self,
501
+ datasetinfo: Dict,
502
+ do_resource_check: bool = True,
503
+ configuration: Optional[Configuration] = None,
442
504
  ) -> Optional[Resource]:
443
505
  """Read metadata from HDX dataset and add to input dictionary. If url
444
506
  is not supplied, will look through resources for one that matches
@@ -454,13 +516,14 @@ class Read(Retrieve):
454
516
  Args:
455
517
  datasetinfo (Dict): Dictionary of information about dataset
456
518
  do_resource_check (bool): Whether to check resources. Defaults to True.
519
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
457
520
 
458
521
  Returns:
459
522
  Optional[Resource]: The resource if a url was not given
460
523
  """
461
524
  dataset_nameinfo = datasetinfo["dataset"]
462
525
  if isinstance(dataset_nameinfo, str):
463
- dataset = self.read_dataset(dataset_nameinfo)
526
+ dataset = self.read_dataset(dataset_nameinfo, configuration)
464
527
  resource = None
465
528
  url = datasetinfo.get("url")
466
529
  resource_name = datasetinfo.get("resource")
@@ -491,24 +554,24 @@ class Read(Retrieve):
491
554
  else:
492
555
  url = resource["url"] # otherwise set the url key in
493
556
  # datasetinfo to the resource url (by setting url here)
494
- datasetinfo[
495
- "hapi_resource_metadata"
496
- ] = self.get_hapi_resource_metadata(resource)
557
+ datasetinfo["hapi_resource_metadata"] = (
558
+ self.get_hapi_resource_metadata(resource)
559
+ )
497
560
  datasetinfo["url"] = url
498
561
  if "source_date" not in datasetinfo:
499
- datasetinfo[
500
- "source_date"
501
- ] = get_startend_dates_from_time_period(
502
- dataset, today=self.today
562
+ datasetinfo["source_date"] = (
563
+ get_startend_dates_from_time_period(
564
+ dataset, today=self.today
565
+ )
503
566
  )
504
567
  if "source" not in datasetinfo:
505
568
  datasetinfo["source"] = dataset["dataset_source"]
506
569
  if "source_url" not in datasetinfo:
507
570
  datasetinfo["source_url"] = dataset.get_hdx_url()
508
571
  Sources.standardise_datasetinfo_source_date(datasetinfo)
509
- datasetinfo[
510
- "hapi_dataset_metadata"
511
- ] = self.get_hapi_dataset_metadata(dataset, datasetinfo)
572
+ datasetinfo["hapi_dataset_metadata"] = (
573
+ self.get_hapi_dataset_metadata(dataset, datasetinfo)
574
+ )
512
575
  return resource
513
576
 
514
577
  if "source_date" not in datasetinfo:
@@ -527,7 +590,7 @@ class Read(Retrieve):
527
590
  for hxltag, dataset_name in dataset_nameinfo.items():
528
591
  dataset = datasets.get(dataset_name)
529
592
  if not dataset:
530
- dataset = self.read_dataset(dataset_name)
593
+ dataset = self.read_dataset(dataset_name, configuration)
531
594
  datasets[dataset_name] = dataset
532
595
  if source_date is not None:
533
596
  if hxltag == "default_dataset":
@@ -561,18 +624,22 @@ class Read(Retrieve):
561
624
  def read_hdx(
562
625
  self,
563
626
  datasetinfo: Dict,
627
+ configuration: Optional[Configuration] = None,
564
628
  **kwargs: Any,
565
629
  ) -> Tuple[List[str], Iterator[Dict]]:
566
630
  """Read data and metadata from HDX dataset
567
631
 
568
632
  Args:
569
633
  datasetinfo (Dict): Dictionary of information about dataset
634
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
570
635
  **kwargs: Parameters to pass to download_file call
571
636
 
572
637
  Returns:
573
638
  Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
574
639
  """
575
- resource = self.read_hdx_metadata(datasetinfo)
640
+ resource = self.read_hdx_metadata(
641
+ datasetinfo, configuration=configuration
642
+ )
576
643
  filename = kwargs.get("filename")
577
644
  if filename:
578
645
  del kwargs["filename"]
@@ -593,12 +660,14 @@ class Read(Retrieve):
593
660
  def read(
594
661
  self,
595
662
  datasetinfo: Dict,
663
+ configuration: Optional[Configuration] = None,
596
664
  **kwargs: Any,
597
665
  ) -> Tuple[List[str], Iterator[Dict]]:
598
666
  """Read data and metadata from HDX dataset
599
667
 
600
668
  Args:
601
669
  datasetinfo (Dict): Dictionary of information about dataset
670
+ configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
602
671
  **kwargs: Parameters to pass to download_file call
603
672
 
604
673
  Returns:
@@ -607,7 +676,9 @@ class Read(Retrieve):
607
676
  format = datasetinfo["format"]
608
677
  if format in ["json", "csv", "xls", "xlsx"]:
609
678
  if "dataset" in datasetinfo:
610
- headers, iterator = self.read_hdx(datasetinfo, **kwargs)
679
+ headers, iterator = self.read_hdx(
680
+ datasetinfo, configuration, **kwargs
681
+ )
611
682
  else:
612
683
  headers, iterator = self.read_tabular(datasetinfo, **kwargs)
613
684
  else:
@@ -0,0 +1,63 @@
1
+ """Populate the sector mapping."""
2
+
3
+ import logging
4
+ from copy import copy
5
+ from typing import Dict, Optional
6
+
7
+ from .reader import Read
8
+ from hdx.utilities.loader import load_yaml
9
+ from hdx.utilities.matching import get_code_from_name
10
+ from hdx.utilities.path import script_dir_plus_file
11
+ from hdx.utilities.text import normalise
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class Sector:
17
+ def __init__(
18
+ self,
19
+ configuration: Optional[Dict] = None,
20
+ ):
21
+ if configuration is None:
22
+ configuration = load_yaml(
23
+ script_dir_plus_file("sector_configuration.yaml", Sector)
24
+ )
25
+ self._datasetinfo = configuration["sector"]
26
+ self.data = copy(configuration["sector_map"])
27
+ self.unmatched = []
28
+ self.populate()
29
+
30
+ def populate(self) -> None:
31
+ logger.info("Populating sector mapping")
32
+
33
+ def parse_sector_values(code: str, name: str):
34
+ self.data[name] = code
35
+ self.data[code] = code
36
+ self.data[normalise(name)] = code
37
+ self.data[normalise(code)] = code
38
+
39
+ reader = Read.get_reader()
40
+ headers, iterator = reader.read(
41
+ self._datasetinfo, file_prefix="sector"
42
+ )
43
+ for row in iterator:
44
+ parse_sector_values(
45
+ code=row["#sector +code +acronym"],
46
+ name=row["#sector +name +preferred +i_en"],
47
+ )
48
+
49
+ extra_entries = {
50
+ "Cash": "Cash programming",
51
+ "Hum": "Humanitarian assistance (unspecified)",
52
+ "Multi": "Multi-sector (unspecified)",
53
+ "Intersectoral": "Intersectoral",
54
+ }
55
+ for code, name in extra_entries.items():
56
+ parse_sector_values(code=code, name=name)
57
+
58
+ def get_sector_code(self, sector: str) -> str | None:
59
+ return get_code_from_name(
60
+ name=sector,
61
+ code_lookup=self.data,
62
+ unmatched=self.unmatched,
63
+ )
@@ -0,0 +1,138 @@
1
+ sector:
2
+ dataset: "global-coordination-groups-beta"
3
+ resource: "Global Coordination Groups (Beta) CSV"
4
+ format: "csv"
5
+ headers: 2
6
+
7
+ sector_map:
8
+ abna: "SHL"
9
+ abri: "SHL"
10
+ abri bna: "SHL"
11
+ abris: "SHL"
12
+ abris ame: "SHL"
13
+ abris bna: "SHL"
14
+ abris bna cccm: "SHL"
15
+ abris durgence et nfi: "SHL"
16
+ abris nfi: "SHL"
17
+ action contre les mines: "PRO-MIN"
18
+ aee: "SHL"
19
+ agriculture: "FSC"
20
+ agua saneamiento e higiene: "WSH"
21
+ all: "Intersectoral"
22
+ alojamiento de emergencia: "SHL"
23
+ alojamiento de emergencia shelter: "SHL"
24
+ alojamientos y asentamientos: "SHL"
25
+ ame: "SHL"
26
+ ash: "WSH"
27
+ assainissement: "WSH"
28
+ camp coordination and camp management: "CCM"
29
+ camp coordination camp management: "CCM"
30
+ cash: "Cash"
31
+ cccm: "CCM"
32
+ ccs: "CCM"
33
+ cluster coordination: "CCM"
34
+ coord services support: "CCM"
35
+ coordinacion informacion: "CCM"
36
+ coordination: "CCM"
37
+ coordination et gestion des camps: "CCM"
38
+ eah: "WSH"
39
+ eau: "WSH"
40
+ eau assainissement et hygiene: "WSH"
41
+ eau hygiene: "WSH"
42
+ eau hygiene assainissement: "WSH"
43
+ eau hygiene et assainissement: "WSH"
44
+ educacion: "EDU"
45
+ educacion en emergencias: "EDU"
46
+ education: "EDU"
47
+ eha: "WSH"
48
+ emergency shelter and non food items: "SHL"
49
+ epah: "WSH"
50
+ erl: "ERY"
51
+ esnfi: "SHL"
52
+ explosive hazards: "PRO-MIN"
53
+ food: "FSC"
54
+ food security and agriculture: "FSC"
55
+ food security and livelihoods: "FSC"
56
+ food security and nutrition: "FSC"
57
+ food security livelihood: "FSC"
58
+ fsl: "FSC"
59
+ gestion des sites daccueil temporaires: "SHL"
60
+ gbv: "PRO-GBV"
61
+ hlp: "PRO-HLP"
62
+ humanitaire: "Hum"
63
+ hygiene: "WSH"
64
+ hygiene assainissement: "WSH"
65
+ intercluster: "Multi" # From Somalia 3W, hopefully not to be confused with intersectoral
66
+ logement terre et biens: "PRO-HLP"
67
+ logistica: "LOG"
68
+ logistique: "LOG"
69
+ manejo y gestion de campamentos: "CCM"
70
+ ms: "Multi"
71
+ multi secteur: "Multi"
72
+ multisectoriel: "Multi"
73
+ nutricion: "NUT"
74
+ nutrition: "NUT"
75
+ operatioanl presence water sanitation hygiene: "WSH"
76
+ operational presence education in emergencies: "EDU"
77
+ operational presence emergency shelter non food items: "SHL"
78
+ operational presence food security agriculture: "FSC"
79
+ operational presence health: "HEA"
80
+ operational presence nutrition: "NUT"
81
+ operational presence protection: "PRO"
82
+ pro cpm: "PRO-CPN"
83
+ pronna: "PRO-CPN"
84
+ propg: "PRO"
85
+ proteccion infantil: "PRO-CPN"
86
+ protection: "PRO"
87
+ protection de lenfance: "PRO-CPN"
88
+ protection de lenfant: "PRO-CPN"
89
+ protection generale: "PRO"
90
+ protection logement terre et propriete: "PRO-HLP"
91
+ protection ltb: "PRO-HLP"
92
+ protection lutte anti mines: "PRO-MIN"
93
+ protection pe: "PRO-CPN"
94
+ protection protection de lenfant: "PRO-CPN"
95
+ protection violences basees sur le genre: "PRO-GBV"
96
+ protection vgb: "PRO-GBV"
97
+ proteccion: "PRO"
98
+ provbg: "PRO-GBV"
99
+ psea: "PRO-GBV"
100
+ rapid response mechanism: "ERY"
101
+ rcf: "CCM"
102
+ rcf education: "EDU"
103
+ rcf food security and livelihoods: "FSC"
104
+ rcf health and nutrtion: "HEA"
105
+ rcf protection: "PRO"
106
+ recuperacion temprana: "ERY"
107
+ relevement precoce: "ERY"
108
+ relevement rapide: "ERY"
109
+ refugee response: "CCM"
110
+ refugees migrants multi sector: "CCM"
111
+ reponse aux refugies: "CCM"
112
+ sa: "FSC"
113
+ sal: "HEA"
114
+ salud: "HEA"
115
+ samv: "FSC"
116
+ sante: "HEA"
117
+ securite alimentaire: "FSC"
118
+ seguridad alimentaria: "FSC"
119
+ seguridad alimentaria y nutricion: "FSC"
120
+ services humanitaires communs: "Hum"
121
+ sexual and reproductive health: "HEA"
122
+ shelter: "SHL"
123
+ shelter nfi: "SHL"
124
+ shelter nfis: "SHL"
125
+ shelter and nfi: "SHL"
126
+ shelter and nfis: "SHL"
127
+ shelter and non food items: "SHL"
128
+ site management: "CCM"
129
+ snfi: "SHL"
130
+ telecommunications: "TEL"
131
+ telecommunications durgence: "TEL"
132
+ telecomunicaciones de emergencia: "TEL"
133
+ vbg: "PRO-GBV"
134
+ violences basees sur le genre: "PRO-GBV"
135
+ violence basee sur le genre: "PRO-GBV"
136
+ violencia basada en genero: "PRO-GBV"
137
+ wash: "WSH"
138
+ water sanitation and hygiene: "WSH"
@@ -282,9 +282,9 @@ class Sources:
282
282
  if no_sources:
283
283
  source_configuration["no_sources"] = True
284
284
  return source_configuration
285
- source_configuration[
286
- "should_overwrite_sources"
287
- ] = should_overwrite_sources
285
+ source_configuration["should_overwrite_sources"] = (
286
+ should_overwrite_sources
287
+ )
288
288
  if suffix_attribute:
289
289
  source_configuration["suffix_attribute"] = suffix_attribute
290
290
  return source_configuration
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: hdx-python-scraper
3
- Version: 2.3.5
3
+ Version: 2.5.2
4
4
  Summary: HDX Python scraper utilities to assemble data from multiple sources
5
5
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
6
6
  Author-email: Michael Rans <rans@email.com>
@@ -26,13 +26,14 @@ Classifier: Programming Language :: Python :: 3.12
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Requires-Python: >=3.8
28
28
  Requires-Dist: gspread
29
- Requires-Dist: hdx-python-api>=6.2.1
30
- Requires-Dist: hdx-python-country>=3.6.4
29
+ Requires-Dist: hdx-python-api>=6.3.7
30
+ Requires-Dist: hdx-python-country>=3.8.6
31
+ Requires-Dist: hdx-python-utilities>=3.8.2
31
32
  Requires-Dist: regex
32
33
  Provides-Extra: dev
33
34
  Requires-Dist: pre-commit; extra == 'dev'
34
35
  Provides-Extra: pandas
35
- Requires-Dist: pandas>=2.1.3; extra == 'pandas'
36
+ Requires-Dist: pandas>=2.2.2; extra == 'pandas'
36
37
  Provides-Extra: test
37
38
  Requires-Dist: pytest; extra == 'test'
38
39
  Requires-Dist: pytest-cov; extra == 'test'
@@ -0,0 +1,27 @@
1
+ hdx/scraper/framework/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
+ hdx/scraper/framework/_version.py,sha256=qrwMUvCUqANtlUPbnE5wPCDZujNKWYOaJRJsJky27Ac,411
3
+ hdx/scraper/framework/base_scraper.py,sha256=vvwljQ5QWr6hpCjOS89RG1pvC955aLoPvm6pSovO75o,15432
4
+ hdx/scraper/framework/runner.py,sha256=GFnZM9HciZFibwwRgDHVk9F_y2n27ctpRwyeD1_ZcKw,53538
5
+ hdx/scraper/framework/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ hdx/scraper/framework/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
7
+ hdx/scraper/framework/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
8
+ hdx/scraper/framework/outputs/googlesheets.py,sha256=jLAfXz4usmLFrePxRIsMflxKPzSGv9T3jlMpSV-s4II,3087
9
+ hdx/scraper/framework/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
10
+ hdx/scraper/framework/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ hdx/scraper/framework/scrapers/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
12
+ hdx/scraper/framework/scrapers/configurable_scraper.py,sha256=PYPtU9XZALNx-2Jr8a8kVVDsT2j9yGgBaw6wXhztQIM,20612
13
+ hdx/scraper/framework/scrapers/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
14
+ hdx/scraper/framework/scrapers/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
15
+ hdx/scraper/framework/scrapers/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
16
+ hdx/scraper/framework/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
17
+ hdx/scraper/framework/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
18
+ hdx/scraper/framework/utilities/reader.py,sha256=pQcGg5TIhl3c-QX_F1sZxY4Ar0N7TLalX38IMuCXA-0,26568
19
+ hdx/scraper/framework/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
20
+ hdx/scraper/framework/utilities/sector.py,sha256=rl_TceRYc5YRoLccr0ABCM42ZLLtLzezWWWQ5YtbQDE,1947
21
+ hdx/scraper/framework/utilities/sector_configuration.yaml,sha256=LAUR5xfLU5qua5qtc3TcwEei0sD1zoCb_vfAxD7Grb8,3894
22
+ hdx/scraper/framework/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
23
+ hdx/scraper/framework/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
24
+ hdx_python_scraper-2.5.2.dist-info/METADATA,sha256=zNHR55fmxnxl_0K3u7zAl1rz1molJ7DX6meBJmK49Es,3361
25
+ hdx_python_scraper-2.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
26
+ hdx_python_scraper-2.5.2.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
27
+ hdx_python_scraper-2.5.2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.21.1
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,25 +0,0 @@
1
- hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
2
- hdx/scraper/_version.py,sha256=-9aYLvgAp04zL8yFAMPjvf6kLKgqW1mLgyuk6XA3LcE,411
3
- hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
4
- hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
5
- hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
7
- hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
8
- hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
9
- hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
10
- hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
11
- hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
13
- hdx/scraper/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
14
- hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1Ep39QY,3087
15
- hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
16
- hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
17
- hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
18
- hdx/scraper/utilities/reader.py,sha256=hexLIJW3CdP4DmobqMM-Z2d6pgcCs1zWWBW-stqoeNU,22975
19
- hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
20
- hdx/scraper/utilities/sources.py,sha256=VNhFYSUM2xeDlN6y4Ya9_0BskjPtjwQZmCKnQgpOemQ,11511
21
- hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
22
- hdx_python_scraper-2.3.5.dist-info/METADATA,sha256=jYBTVEB111S1R3Cj8fZByzM4E3nRRKCr31bsCPstjPA,3318
23
- hdx_python_scraper-2.3.5.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
24
- hdx_python_scraper-2.3.5.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
25
- hdx_python_scraper-2.3.5.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes