hdx_python_scraper-2.3.5-py3-none-any.whl → hdx_python_scraper-2.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. hdx/scraper/{__init__.py → framework/__init__.py} +0 -0
  2. hdx/scraper/{_version.py → framework/_version.py} +2 -2
  3. hdx/scraper/{base_scraper.py → framework/base_scraper.py} +4 -4
  4. hdx/scraper/{outputs → framework/outputs}/googlesheets.py +1 -1
  5. hdx/scraper/{runner.py → framework/runner.py} +12 -12
  6. hdx/scraper/{configurable/scraper.py → framework/scrapers/configurable_scraper.py} +5 -5
  7. hdx/scraper/{configurable → framework/scrapers}/rowparser.py +58 -23
  8. hdx/scraper/{utilities → framework/utilities}/reader.py +98 -22
  9. hdx/scraper/framework/utilities/sector.py +63 -0
  10. hdx/scraper/framework/utilities/sector_configuration.yaml +138 -0
  11. hdx/scraper/{utilities → framework/utilities}/sources.py +3 -3
  12. {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.3.dist-info}/METADATA +6 -5
  13. hdx_python_scraper-2.5.3.dist-info/RECORD +27 -0
  14. {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.3.dist-info}/WHEEL +1 -1
  15. hdx_python_scraper-2.3.5.dist-info/RECORD +0 -25
  16. hdx/scraper/{configurable → framework/outputs}/__init__.py +0 -0
  17. hdx/scraper/{outputs → framework/outputs}/base.py +0 -0
  18. hdx/scraper/{outputs → framework/outputs}/excelfile.py +0 -0
  19. hdx/scraper/{outputs → framework/outputs}/json.py +0 -0
  20. hdx/scraper/{outputs → framework/scrapers}/__init__.py +0 -0
  21. hdx/scraper/{configurable → framework/scrapers}/aggregator.py +0 -0
  22. hdx/scraper/{configurable → framework/scrapers}/resource_downloader.py +0 -0
  23. hdx/scraper/{configurable → framework/scrapers}/timeseries.py +0 -0
  24. hdx/scraper/{utilities → framework/utilities}/__init__.py +0 -0
  25. hdx/scraper/{utilities → framework/utilities}/fallbacks.py +0 -0
  26. hdx/scraper/{utilities → framework/utilities}/region_lookup.py +0 -0
  27. hdx/scraper/{utilities → framework/utilities}/writer.py +0 -0
  28. {hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.3.dist-info}/licenses/LICENSE +0 -0

hdx/scraper/{_version.py → framework/_version.py}
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '2.3.5'
- __version_tuple__ = version_tuple = (2, 3, 5)
+ __version__ = version = '2.5.3'
+ __version_tuple__ = version_tuple = (2, 5, 3)

hdx/scraper/{base_scraper.py → framework/base_scraper.py}
@@ -36,7 +36,7 @@ class BaseScraper(ABC):
          self.reader = datasetinfo.get("reader", name)
          self.setup(headers, source_configuration)
          self.datasetinfo = deepcopy(datasetinfo)
-         self.errors_on_exit = None
+         self.error_handler = None
          self.can_fallback = True
 
      def setup(
@@ -141,9 +141,9 @@ class BaseScraper(ABC):
              "should_overwrite_sources"
          )
          if should_overwrite_sources is not None:
-             self.source_configuration[
-                 "should_overwrite_sources"
-             ] = should_overwrite_sources
+             self.source_configuration["should_overwrite_sources"] = (
+                 should_overwrite_sources
+             )
          source = self.datasetinfo["source"]
          if isinstance(source, str):
              source = {"default_source": source}

hdx/scraper/{outputs → framework/outputs}/googlesheets.py
@@ -91,4 +91,4 @@ class GoogleSheets(BaseOutput):
              df.fillna("NaN", inplace=True)
              rows.extend(df.values.tolist())
              values = rows
-         tab.update("A1", values)
+         tab.update(values, "A1")
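
The argument swap follows gspread 6, which changed Worksheet.update to take the values before the range name. A minimal sketch of the new calling convention, assuming gspread >= 6 and hypothetical spreadsheet/credential names:

import gspread

gc = gspread.service_account()  # assumes a configured service-account JSON
tab = gc.open("my-spreadsheet").sheet1  # hypothetical spreadsheet name

values = [["#date", "#affected"], ["2024-01-01", 1000]]
tab.update(values, "A1")  # gspread >= 6: values first, then the target range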

hdx/scraper/{runner.py → framework/runner.py}
@@ -5,18 +5,18 @@ from traceback import format_exc
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
  from .base_scraper import BaseScraper
- from .configurable.aggregator import Aggregator
- from .configurable.resource_downloader import ResourceDownloader
- from .configurable.scraper import ConfigurableScraper
- from .configurable.timeseries import TimeSeries
  from .outputs.base import BaseOutput
+ from .scrapers.aggregator import Aggregator
+ from .scrapers.configurable_scraper import ConfigurableScraper
+ from .scrapers.resource_downloader import ResourceDownloader
+ from .scrapers.timeseries import TimeSeries
  from .utilities import get_startend_dates_from_time_period
  from .utilities.fallbacks import Fallbacks
  from .utilities.reader import Read
  from .utilities.sources import Sources
  from hdx.location.adminlevel import AdminLevel
  from hdx.utilities.dateparse import now_utc
- from hdx.utilities.errors_onexit import ErrorsOnExit
+ from hdx.utilities.error_handler import ErrorHandler
  from hdx.utilities.typehint import ListTuple
 
  logger = logging.getLogger(__name__)
@@ -28,7 +28,7 @@ class Runner:
      Args:
          countryiso3s (ListTuple[str]): List of ISO3 country codes to process
          today (datetime): Value to use for today. Defaults to now_utc().
-         errors_on_exit (ErrorsOnExit): ErrorsOnExit object that logs errors on exit
+         error_handler (ErrorHandler): ErrorHandler object that logs errors on exit
          scrapers_to_run (Optional[ListTuple[str]]): Scrapers to run. Defaults to None (all scrapers).
      """
 
@@ -36,12 +36,12 @@
          self,
          countryiso3s: ListTuple[str],
          today: datetime = now_utc(),
-         errors_on_exit: Optional[ErrorsOnExit] = None,
+         error_handler: Optional[ErrorHandler] = None,
          scrapers_to_run: Optional[ListTuple[str]] = None,
      ):
          self.countryiso3s = countryiso3s
          self.today = today
-         self.errors_on_exit = errors_on_exit
+         self.error_handler = error_handler
          if isinstance(scrapers_to_run, tuple):
              scrapers_to_run = list(scrapers_to_run)
          self.scrapers_to_run: Optional[List[str]] = scrapers_to_run
@@ -73,7 +73,7 @@
              and scraper_name not in self.scrapers_to_run
          ):
              self.scrapers_to_run.append(scraper_name)
-         scraper.errors_on_exit = self.errors_on_exit
+         scraper.error_handler = self.error_handler
          return scraper_name
 
      def add_customs(
@@ -142,7 +142,7 @@
              level_name,
              source_configuration,
              self.today,
-             self.errors_on_exit,
+             self.error_handler,
          )
          if scraper_name not in self.scraper_names:
              self.scraper_names.append(scraper_name)
@@ -612,8 +612,8 @@
          if not Fallbacks.exist() or scraper.can_fallback is False:
              raise
          logger.exception(f"Using fallbacks for {scraper.name}!")
-         if self.errors_on_exit:
-             self.errors_on_exit.add(
+         if self.error_handler:
+             self.error_handler.add(
                  f"Using fallbacks for {scraper.name}! Error: {format_exc()}"
              )
          for level in scraper.headers.keys():
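
Throughout this release, errors_on_exit/ErrorsOnExit is replaced by error_handler/ErrorHandler from hdx-python-utilities (hence the new hdx-python-utilities>=3.8.2 requirement in METADATA below). A migration sketch, assuming ErrorHandler keeps the context-manager usage that ErrorsOnExit had, with hypothetical pipeline values:

from hdx.scraper.framework.runner import Runner
from hdx.utilities.dateparse import now_utc
from hdx.utilities.error_handler import ErrorHandler

# Errors collected via error_handler.add(...) are logged when the
# with-block exits, as with ErrorsOnExit before the rename
with ErrorHandler() as error_handler:
    runner = Runner(
        countryiso3s=("AFG", "SOM"),  # hypothetical country list
        today=now_utc(),
        error_handler=error_handler,  # was: errors_on_exit=ErrorsOnExit()
    )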

hdx/scraper/{configurable/scraper.py → framework/scrapers/configurable_scraper.py}
@@ -17,7 +17,7 @@ from hdx.utilities.dateparse import (
  )
  from hdx.utilities.dictandlist import dict_of_lists_add
  from hdx.utilities.downloader import DownloadError
- from hdx.utilities.errors_onexit import ErrorsOnExit
+ from hdx.utilities.error_handler import ErrorHandler
  from hdx.utilities.text import (  # noqa: F401
      get_fraction_str,
      get_numeric_if_possible,
@@ -42,7 +42,7 @@ class ConfigurableScraper(BaseScraper):
          level_name (Optional[str]): Customised level_name name. Defaults to None (level).
          source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
          today (datetime): Value to use for today. Defaults to now_utc().
-         errors_on_exit (Optional[ErrorsOnExit]): ErrorsOnExit object that logs errors on exit
+         error_handler (Optional[ErrorHandler]): ErrorHandler object that logs errors on exit
          **kwargs: Variables to use when evaluating template arguments in urls
      """
 
@@ -67,7 +67,7 @@ class ConfigurableScraper(BaseScraper):
          level_name: Optional[str] = None,
          source_configuration: Dict = {},
          today: datetime = now_utc(),
-         errors_on_exit: Optional[ErrorsOnExit] = None,
+         error_handler: Optional[ErrorHandler] = None,
          **kwargs: Any,
      ):
          self.name = name
@@ -83,10 +83,10 @@ class ConfigurableScraper(BaseScraper):
          else:
              self.level_name: str = level_name
          self.countryiso3s = countryiso3s
-         self.adminlevel = adminlevel
+         self.adminlevel: Optional[AdminLevel] = adminlevel
          self.today = today
          self.subsets = self.get_subsets_from_datasetinfo(datasetinfo)
-         self.errors_on_exit: Optional[ErrorsOnExit] = errors_on_exit
+         self.error_handler: Optional[ErrorHandler] = error_handler
          self.variables = kwargs
          self.rowparser = None
          self.datasetinfo = copy.deepcopy(datasetinfo)

hdx/scraper/{configurable → framework/scrapers}/rowparser.py
@@ -185,20 +185,14 @@ class RowParser:
          Returns:
              Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
          """
-         rows = []
-         for row in iterator:
-             if self.header_to_hxltag:
-                 newrow = {}
-                 for header in row:
-                     newrow[self.header_to_hxltag[header]] = row[header]
-                 row = newrow
-             if self.stop_row:
-                 if all(
-                     row[key] == value for key, value in self.stop_row.items()
-                 ):
-                     break
-             for newrow in self.flatten(row):
-                 rows.append(newrow)
+         if self.header_to_hxltag:
+             iterator = self.header_to_hxltag_rows(iterator)
+         if self.stop_row:
+             iterator = self.stop_rows(iterator)
+         if self.flatteninfo:
+             iterator = self.flatten_rows(iterator)
+         if self.prefilter:
+             iterator = (row for row in iterator if eval(self.prefilter))
          if not self.sort:
              if self.datecol:
                  for subset in self.subsets:
@@ -212,15 +206,59 @@
                      )
                      self.sort = {"keys": [self.datecol], "reverse": True}
                      break
-         if self.prefilter:
-             rows = [row for row in rows if eval(self.prefilter)]
          if self.sort:
              keys = self.sort["keys"]
              reverse = self.sort.get("reverse", False)
-             rows = sorted(rows, key=itemgetter(*keys), reverse=reverse)
-         return rows
+             iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
+         return iterator
+
+     def header_to_hxltag_rows(
+         self, iterator: Iterator[Dict]
+     ) -> Generator[Dict, None, None]:
+         """Convert headers to HXL tags in keys
+
+         Args:
+             iterator (Iterator[Dict]): Input data
+
+         Returns:
+             Generator[Dict]: Rows where keys are HXL tags
+         """
+         for row in iterator:
+             newrow = {}
+             for header in row:
+                 newrow[self.header_to_hxltag[header]] = row[header]
+             yield newrow
+
+     def stop_rows(
+         self, iterator: Iterator[Dict]
+     ) -> Generator[Dict, None, None]:
+         """Stop processing rows after condition met
+
+         Args:
+             iterator (Iterator[Dict]): Input data
+
+         Returns:
+             Generator[Dict]: Rows up to stop condition
+         """
+         for row in iterator:
+             if all(row[key] == value for key, value in self.stop_row.items()):
+                 break
+             yield row
+
+     def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
+         """Flatten rows
+
+         Args:
+             iterator (Iterator[Dict]): Input data
+
+         Returns:
+             Generator[Dict]: Flattened rows
+         """
+         for row in iterator:
+             for newrow in self.flatten_row(row):
+                 yield newrow
 
-     def flatten(self, row: Dict) -> Generator[Dict, None, None]:
+     def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
          """Flatten a wide spreadsheet format into a long one
 
          Args:
@@ -229,9 +267,6 @@
          Returns:
              Generator[Dict]: Flattened row(s)
          """
-         if not self.flatteninfo:
-             yield row
-             return
          counters = [-1 for _ in self.flatteninfo]
          while True:
              newrow = copy.deepcopy(row)
@@ -314,7 +349,7 @@
              adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
          elif i == 1:
              adms[i], exact = self.adminlevel.get_pcode(
-                 adms[0], adm, self.name
+                 adms[0], adm, logname=self.name
              )
          if adms[i] not in self.adms[i]:
              adms[i] = None
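
The parse refactor above replaces one accumulating rows list with chained generators, so HXL-tag renaming, the stop row, flattening and the prefilter are applied lazily row by row; only a configured sort materialises the stream. The same pattern in isolation (these functions are illustrative, not the library's API):

from typing import Dict, Iterator

def tag_keys(rows: Iterator[Dict]) -> Iterator[Dict]:
    # Stand-in for header_to_hxltag_rows: rewrite keys one row at a time
    for row in rows:
        yield {f"#{key}": value for key, value in row.items()}

def stop_at(rows: Iterator[Dict], key: str, value: str) -> Iterator[Dict]:
    # Stand-in for stop_rows: end the stream once the stop row appears
    for row in rows:
        if row.get(key) == value:
            break
        yield row

rows = iter([{"adm": "AFG"}, {"adm": "Total"}, {"adm": "SOM"}])
pipeline = stop_at(tag_keys(rows), "#adm", "Total")
print(list(pipeline))  # [{'#adm': 'AFG'}] - rows after the stop row are never read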

hdx/scraper/{utilities → framework/utilities}/reader.py
@@ -1,3 +1,4 @@
+ import glob
  import logging
  from datetime import datetime
  from os.path import join
@@ -10,6 +11,7 @@ from slugify import slugify
 
  from . import get_startend_dates_from_time_period, match_template
  from .sources import Sources
+ from hdx.api.configuration import Configuration
  from hdx.data.dataset import Dataset
  from hdx.data.resource import Resource
  from hdx.utilities.dateparse import parse_date
@@ -115,6 +117,11 @@
              for name in basic_auths:
                  custom_configs[name] = {"basic_auth": basic_auths[name]}
              del kwargs["basic_auths"]
+         bearer_tokens = kwargs.get("bearer_tokens")
+         if bearer_tokens is not None:
+             for name in bearer_tokens:
+                 custom_configs[name] = {"bearer_token": bearer_tokens[name]}
+             del kwargs["bearer_tokens"]
          param_auths = kwargs.get("param_auths")
          if param_auths is not None:
              for name in param_auths:
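
The new bearer_tokens handling mirrors basic_auths: a mapping from source name to bearer token becomes a per-name custom downloader configuration. A sketch of passing it through reader setup, assuming these kwargs arrive via Read.create_readers and with hypothetical directories, source name and token:

from hdx.scraper.framework.utilities.reader import Read

Read.create_readers(
    "fallbacks",   # fallback_dir
    "saved_data",  # saved_dir
    "temp",        # temp_dir
    save=False,
    use_saved=False,
    bearer_tokens={"myapi": "s3cr3t-token"},  # hypothetical name and token
)
reader = Read.get_reader("myapi")  # downloads with this reader send the token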
@@ -204,15 +211,19 @@
          if headers is None:
              headers = 1
              datasetinfo["headers"] = 1
-         kwargs["headers"] = headers
-         if isinstance(headers, list):
-             kwargs["fill_merged_cells"] = True
          format = datasetinfo["format"]
          kwargs["format"] = format
-         if not sheet and format in ("xls", "xlsx"):
-             sheet = 1
+         if format in ("xls", "xlsx"):
+             if not sheet:
+                 sheet = 1
+             if isinstance(headers, list):
+                 kwargs["fill_merged_cells"] = True
+             elif "fill_merged_cells" not in kwargs:
+                 kwargs["fill_merged_cells"] = False
+             kwargs["xlsx2csv"] = datasetinfo.get("xlsx2csv", False)
          if sheet:
              kwargs["sheet"] = sheet
+         kwargs["headers"] = headers
          compression = datasetinfo.get("compression")
          if compression:
              kwargs["compression"] = compression
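
For Excel inputs the effect is: the sheet defaults to the first one, fill_merged_cells is forced on only for multi-row headers (and defaults off otherwise), and the new per-dataset xlsx2csv flag is forwarded with the other download kwargs. A hypothetical datasetinfo showing the keys this block consumes:

datasetinfo = {
    "dataset": "some-hdx-dataset",  # hypothetical
    "format": "xlsx",
    "sheet": "Data",    # if omitted, defaults to the first sheet
    "headers": [1, 2],  # a multi-row header forces fill_merged_cells=True
    "xlsx2csv": True,   # new option passed through to the download call
}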
@@ -238,11 +249,14 @@
              **kwargs,
          )
 
-     def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
+     def read_dataset(
+         self, dataset_name: str, configuration: Optional[Configuration] = None
+     ) -> Optional[Dataset]:
          """Read HDX dataset
 
          Args:
              dataset_name (str): Dataset name
+             configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
 
          Returns:
              Optional[Dataset]: The dataset that was read or None
@@ -252,7 +266,7 @@
              logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
              dataset = Dataset.load_from_json(saved_path)
          else:
-             dataset = Dataset.read_from_hdx(dataset_name)
+             dataset = Dataset.read_from_hdx(dataset_name, configuration)
          if self.save:
              logger.info(f"Saving dataset {dataset_name} in {saved_path}")
              if dataset is None:
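
The next hunk adds search_datasets, which wraps Dataset.search_in_hdx in the reader's save/use_saved machinery, persisting each result as <filename>_<n>.json so a later run can replay the search offline. A usage sketch with hypothetical query values:

reader = Read.get_reader()
datasets = reader.search_datasets(
    filename="conflict_datasets",  # saved as conflict_datasets_0.json, ...
    query='title:"conflict"',      # hypothetical Solr query
    fq="groups:afg",               # hypothetical filter query
)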
@@ -261,6 +275,56 @@
                  dataset.save_to_json(saved_path, follow_urls=True)
          return dataset
 
+     def search_datasets(
+         self,
+         filename: str,
+         query: Optional[str] = "*:*",
+         configuration: Optional[Configuration] = None,
+         page_size: int = 1000,
+         **kwargs: Any,
+     ) -> List[Dataset]:
+         """Read HDX dataset
+
+         Args:
+             filename (str): Filename for saved files. Will be prefixed by underscore and a number.
+             query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
+             configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
+             page_size (int): Size of page to return. Defaults to 1000.
+             **kwargs: See below
+             fq (string): Any filter queries to apply
+             rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
+             start (int): Offset in the complete result for where the set of returned datasets should begin
+             sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
+             facet (string): Whether to enable faceted results. Default to True.
+             facet.mincount (int): Minimum counts for facet fields should be included in the results
+             facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
+             facet.field (List[str]): Fields to facet upon. Default is empty.
+             use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.
+
+         Returns:
+             List[Dataset]: list of datasets resulting from query
+         """
+
+         saved_path = join(self.saved_dir, filename)
+         if self.use_saved:
+             logger.info(
+                 f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
+             )
+             datasets = []
+             for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
+                 datasets.append(Dataset.load_from_json(file_path))
+         else:
+             datasets = Dataset.search_in_hdx(
+                 query, configuration, page_size, **kwargs
+             )
+             if self.save:
+                 for i, dataset in enumerate(datasets):
+                     file_path = f"{saved_path}_{i}.json"
+                     name = dataset["name"]
+                     logger.info(f"Saving dataset {name} in {file_path}")
+                     dataset.save_to_json(file_path, follow_urls=True)
+         return datasets
+
      @staticmethod
      def construct_filename(name: str, format: str):
          """Construct filename from name and format. The filename of the file
@@ -438,7 +502,10 @@
          return self.hxl_info_file(name, format, url, **kwargs)
 
      def read_hdx_metadata(
-         self, datasetinfo: Dict, do_resource_check: bool = True
+         self,
+         datasetinfo: Dict,
+         do_resource_check: bool = True,
+         configuration: Optional[Configuration] = None,
      ) -> Optional[Resource]:
          """Read metadata from HDX dataset and add to input dictionary. If url
          is not supplied, will look through resources for one that matches
@@ -454,13 +521,14 @@
          Args:
              datasetinfo (Dict): Dictionary of information about dataset
              do_resource_check (bool): Whether to check resources. Defaults to False.
+             configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
 
          Returns:
              Optional[Resource]: The resource if a url was not given
          """
          dataset_nameinfo = datasetinfo["dataset"]
          if isinstance(dataset_nameinfo, str):
-             dataset = self.read_dataset(dataset_nameinfo)
+             dataset = self.read_dataset(dataset_nameinfo, configuration)
              resource = None
              url = datasetinfo.get("url")
              resource_name = datasetinfo.get("resource")
@@ -491,24 +559,24 @@
              else:
                  url = resource["url"]  # otherwise set the url key in
                  # datasetinfo to the resource url (by setting url here)
-             datasetinfo[
-                 "hapi_resource_metadata"
-             ] = self.get_hapi_resource_metadata(resource)
+             datasetinfo["hapi_resource_metadata"] = (
+                 self.get_hapi_resource_metadata(resource)
+             )
              datasetinfo["url"] = url
              if "source_date" not in datasetinfo:
-                 datasetinfo[
-                     "source_date"
-                 ] = get_startend_dates_from_time_period(
-                     dataset, today=self.today
+                 datasetinfo["source_date"] = (
+                     get_startend_dates_from_time_period(
+                         dataset, today=self.today
+                     )
                  )
              if "source" not in datasetinfo:
                  datasetinfo["source"] = dataset["dataset_source"]
              if "source_url" not in datasetinfo:
                  datasetinfo["source_url"] = dataset.get_hdx_url()
              Sources.standardise_datasetinfo_source_date(datasetinfo)
-             datasetinfo[
-                 "hapi_dataset_metadata"
-             ] = self.get_hapi_dataset_metadata(dataset, datasetinfo)
+             datasetinfo["hapi_dataset_metadata"] = (
+                 self.get_hapi_dataset_metadata(dataset, datasetinfo)
+             )
              return resource
 
          if "source_date" not in datasetinfo:
@@ -527,7 +595,7 @@
          for hxltag, dataset_name in dataset_nameinfo.items():
              dataset = datasets.get(dataset_name)
              if not dataset:
-                 dataset = self.read_dataset(dataset_name)
+                 dataset = self.read_dataset(dataset_name, configuration)
              datasets[dataset_name] = dataset
              if source_date is not None:
                  if hxltag == "default_dataset":
@@ -561,18 +629,22 @@
      def read_hdx(
          self,
          datasetinfo: Dict,
+         configuration: Optional[Configuration] = None,
          **kwargs: Any,
      ) -> Tuple[List[str], Iterator[Dict]]:
          """Read data and metadata from HDX dataset
 
          Args:
              datasetinfo (Dict): Dictionary of information about dataset
+             configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
              **kwargs: Parameters to pass to download_file call
 
          Returns:
              Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
          """
-         resource = self.read_hdx_metadata(datasetinfo)
+         resource = self.read_hdx_metadata(
+             datasetinfo, configuration=configuration
+         )
          filename = kwargs.get("filename")
          if filename:
              del kwargs["filename"]
@@ -593,12 +665,14 @@
      def read(
          self,
          datasetinfo: Dict,
+         configuration: Optional[Configuration] = None,
          **kwargs: Any,
      ) -> Tuple[List[str], Iterator[Dict]]:
          """Read data and metadata from HDX dataset
 
          Args:
              datasetinfo (Dict): Dictionary of information about dataset
+             configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
              **kwargs: Parameters to pass to download_file call
 
          Returns:
@@ -607,7 +681,9 @@
          format = datasetinfo["format"]
          if format in ["json", "csv", "xls", "xlsx"]:
              if "dataset" in datasetinfo:
-                 headers, iterator = self.read_hdx(datasetinfo, **kwargs)
+                 headers, iterator = self.read_hdx(
+                     datasetinfo, configuration, **kwargs
+                 )
              else:
                  headers, iterator = self.read_tabular(datasetinfo, **kwargs)
          else:
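
read, read_hdx, read_hdx_metadata and read_dataset now thread an optional Configuration down to Dataset.read_from_hdx, so a pipeline can read from a non-default HDX instance rather than the global configuration. A sketch with hypothetical site and dataset values:

from hdx.api.configuration import Configuration

stage = Configuration(
    hdx_site="stage", user_agent="my_pipeline", hdx_read_only=True
)  # assumes a separately instantiated, non-global configuration
reader = Read.get_reader()
headers, iterator = reader.read(
    {"dataset": "some-dataset", "format": "csv"},  # hypothetical
    configuration=stage,
)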

hdx/scraper/framework/utilities/sector.py (new)
@@ -0,0 +1,63 @@
+ """Populate the sector mapping."""
+
+ import logging
+ from copy import copy
+ from typing import Dict, Optional
+
+ from .reader import Read
+ from hdx.utilities.loader import load_yaml
+ from hdx.utilities.matching import get_code_from_name
+ from hdx.utilities.path import script_dir_plus_file
+ from hdx.utilities.text import normalise
+
+ logger = logging.getLogger(__name__)
+
+
+ class Sector:
+     def __init__(
+         self,
+         configuration: Optional[Dict] = None,
+     ):
+         if configuration is None:
+             configuration = load_yaml(
+                 script_dir_plus_file("sector_configuration.yaml", Sector)
+             )
+         self._datasetinfo = configuration["sector"]
+         self.data = copy(configuration["sector_map"])
+         self.unmatched = []
+         self.populate()
+
+     def populate(self) -> None:
+         logger.info("Populating sector mapping")
+
+         def parse_sector_values(code: str, name: str):
+             self.data[name] = code
+             self.data[code] = code
+             self.data[normalise(name)] = code
+             self.data[normalise(code)] = code
+
+         reader = Read.get_reader()
+         headers, iterator = reader.read(
+             self._datasetinfo, file_prefix="sector"
+         )
+         for row in iterator:
+             parse_sector_values(
+                 code=row["#sector +code +acronym"],
+                 name=row["#sector +name +preferred +i_en"],
+             )
+
+         extra_entries = {
+             "Cash": "Cash programming",
+             "Hum": "Humanitarian assistance (unspecified)",
+             "Multi": "Multi-sector (unspecified)",
+             "Intersectoral": "Intersectoral",
+         }
+         for code, name in extra_entries.items():
+             parse_sector_values(code=code, name=name)
+
+     def get_sector_code(self, sector: str) -> str | None:
+         return get_code_from_name(
+             name=sector,
+             code_lookup=self.data,
+             unmatched=self.unmatched,
+         )
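
Sector builds its lookup keyed by sector code, sector name and their normalise()d forms, seeded from the bundled sector_map below (which catches French and Spanish variants plus common misspellings) and from the HDX global coordination groups dataset. A usage sketch, assuming readers have already been set up via Read and an HDX configuration exists:

from hdx.scraper.framework.utilities.sector import Sector

sector = Sector()  # loads sector_configuration.yaml, then populates from HDX
sector.get_sector_code("wash")        # "WSH" via the bundled sector_map
sector.get_sector_code("Protection")  # "PRO"
sector.get_sector_code("gibberish")   # no match: recorded in sector.unmatched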

hdx/scraper/framework/utilities/sector_configuration.yaml (new)
@@ -0,0 +1,138 @@
+ sector:
+   dataset: "global-coordination-groups-beta"
+   resource: "Global Coordination Groups (Beta) CSV"
+   format: "csv"
+   headers: 2
+
+ sector_map:
+   abna: "SHL"
+   abri: "SHL"
+   abri bna: "SHL"
+   abris: "SHL"
+   abris ame: "SHL"
+   abris bna: "SHL"
+   abris bna cccm: "SHL"
+   abris durgence et nfi: "SHL"
+   abris nfi: "SHL"
+   action contre les mines: "PRO-MIN"
+   aee: "SHL"
+   agriculture: "FSC"
+   agua saneamiento e higiene: "WSH"
+   all: "Intersectoral"
+   alojamiento de emergencia: "SHL"
+   alojamiento de emergencia shelter: "SHL"
+   alojamientos y asentamientos: "SHL"
+   ame: "SHL"
+   ash: "WSH"
+   assainissement: "WSH"
+   camp coordination and camp management: "CCM"
+   camp coordination camp management: "CCM"
+   cash: "Cash"
+   cccm: "CCM"
+   ccs: "CCM"
+   cluster coordination: "CCM"
+   coord services support: "CCM"
+   coordinacion informacion: "CCM"
+   coordination: "CCM"
+   coordination et gestion des camps: "CCM"
+   eah: "WSH"
+   eau: "WSH"
+   eau assainissement et hygiene: "WSH"
+   eau hygiene: "WSH"
+   eau hygiene assainissement: "WSH"
+   eau hygiene et assainissement: "WSH"
+   educacion: "EDU"
+   educacion en emergencias: "EDU"
+   education: "EDU"
+   eha: "WSH"
+   emergency shelter and non food items: "SHL"
+   epah: "WSH"
+   erl: "ERY"
+   esnfi: "SHL"
+   explosive hazards: "PRO-MIN"
+   food: "FSC"
+   food security and agriculture: "FSC"
+   food security and livelihoods: "FSC"
+   food security and nutrition: "FSC"
+   food security livelihood: "FSC"
+   fsl: "FSC"
+   gestion des sites daccueil temporaires: "SHL"
+   gbv: "PRO-GBV"
+   hlp: "PRO-HLP"
+   humanitaire: "Hum"
+   hygiene: "WSH"
+   hygiene assainissement: "WSH"
+   intercluster: "Multi" # From Somalia 3W, hopefully not to be confused with intersectoral
+   logement terre et biens: "PRO-HLP"
+   logistica: "LOG"
+   logistique: "LOG"
+   manejo y gestion de campamentos: "CCM"
+   ms: "Multi"
+   multi secteur: "Multi"
+   multisectoriel: "Multi"
+   nutricion: "NUT"
+   nutrition: "NUT"
+   operatioanl presence water sanitation hygiene: "WSH"
+   operational presence education in emergencies: "EDU"
+   operational presence emergency shelter non food items: "SHL"
+   operational presence food security agriculture: "FSC"
+   operational presence health: "HEA"
+   operational presence nutrition: "NUT"
+   operational presence protection: "PRO"
+   pro cpm: "PRO-CPN"
+   pronna: "PRO-CPN"
+   propg: "PRO"
+   proteccion infantil: "PRO-CPN"
+   protection: "PRO"
+   protection de lenfance: "PRO-CPN"
+   protection de lenfant: "PRO-CPN"
+   protection generale: "PRO"
+   protection logement terre et propriete: "PRO-HLP"
+   protection ltb: "PRO-HLP"
+   protection lutte anti mines: "PRO-MIN"
+   protection pe: "PRO-CPN"
+   protection protection de lenfant: "PRO-CPN"
+   protection violences basees sur le genre: "PRO-GBV"
+   protection vgb: "PRO-GBV"
+   proteccion: "PRO"
+   provbg: "PRO-GBV"
+   psea: "PRO-GBV"
+   rapid response mechanism: "ERY"
+   rcf: "CCM"
+   rcf education: "EDU"
+   rcf food security and livelihoods: "FSC"
+   rcf health and nutrtion: "HEA"
+   rcf protection: "PRO"
+   recuperacion temprana: "ERY"
+   relevement precoce: "ERY"
+   relevement rapide: "ERY"
+   refugee response: "CCM"
+   refugees migrants multi sector: "CCM"
+   reponse aux refugies: "CCM"
+   sa: "FSC"
+   sal: "HEA"
+   salud: "HEA"
+   samv: "FSC"
+   sante: "HEA"
+   securite alimentaire: "FSC"
+   seguridad alimentaria: "FSC"
+   seguridad alimentaria y nutricion: "FSC"
+   services humanitaires communs: "Hum"
+   sexual and reproductive health: "HEA"
+   shelter: "SHL"
+   shelter nfi: "SHL"
+   shelter nfis: "SHL"
+   shelter and nfi: "SHL"
+   shelter and nfis: "SHL"
+   shelter and non food items: "SHL"
+   site management: "CCM"
+   snfi: "SHL"
+   telecommunications: "TEL"
+   telecommunications durgence: "TEL"
+   telecomunicaciones de emergencia: "TEL"
+   vbg: "PRO-GBV"
+   violences basees sur le genre: "PRO-GBV"
+   violence basee sur le genre: "PRO-GBV"
+   violencia basada en genero: "PRO-GBV"
+   wash: "WSH"
+   water sanitation and hygiene: "WSH"

hdx/scraper/{utilities → framework/utilities}/sources.py
@@ -282,9 +282,9 @@ class Sources:
          if no_sources:
              source_configuration["no_sources"] = True
              return source_configuration
-         source_configuration[
-             "should_overwrite_sources"
-         ] = should_overwrite_sources
+         source_configuration["should_overwrite_sources"] = (
+             should_overwrite_sources
+         )
          if suffix_attribute:
              source_configuration["suffix_attribute"] = suffix_attribute
          return source_configuration

{hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.3.dist-info}/METADATA
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: hdx-python-scraper
- Version: 2.3.5
+ Version: 2.5.3
  Summary: HDX Python scraper utilities to assemble data from multiple sources
  Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
  Author-email: Michael Rans <rans@email.com>
@@ -26,13 +26,14 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.8
  Requires-Dist: gspread
- Requires-Dist: hdx-python-api>=6.2.1
- Requires-Dist: hdx-python-country>=3.6.4
+ Requires-Dist: hdx-python-api>=6.3.7
+ Requires-Dist: hdx-python-country>=3.8.6
+ Requires-Dist: hdx-python-utilities>=3.8.2
  Requires-Dist: regex
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == 'dev'
  Provides-Extra: pandas
- Requires-Dist: pandas>=2.1.3; extra == 'pandas'
+ Requires-Dist: pandas>=2.2.2; extra == 'pandas'
  Provides-Extra: test
  Requires-Dist: pytest; extra == 'test'
  Requires-Dist: pytest-cov; extra == 'test'

hdx_python_scraper-2.5.3.dist-info/RECORD (new)
@@ -0,0 +1,27 @@
+ hdx/scraper/framework/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
+ hdx/scraper/framework/_version.py,sha256=YlFdzLR6C3fl-9jq4_71rr5eVxx1hHLisz6muXMUhiQ,411
+ hdx/scraper/framework/base_scraper.py,sha256=vvwljQ5QWr6hpCjOS89RG1pvC955aLoPvm6pSovO75o,15432
+ hdx/scraper/framework/runner.py,sha256=GFnZM9HciZFibwwRgDHVk9F_y2n27ctpRwyeD1_ZcKw,53538
+ hdx/scraper/framework/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ hdx/scraper/framework/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
+ hdx/scraper/framework/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
+ hdx/scraper/framework/outputs/googlesheets.py,sha256=jLAfXz4usmLFrePxRIsMflxKPzSGv9T3jlMpSV-s4II,3087
+ hdx/scraper/framework/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
+ hdx/scraper/framework/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ hdx/scraper/framework/scrapers/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
+ hdx/scraper/framework/scrapers/configurable_scraper.py,sha256=PYPtU9XZALNx-2Jr8a8kVVDsT2j9yGgBaw6wXhztQIM,20612
+ hdx/scraper/framework/scrapers/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
+ hdx/scraper/framework/scrapers/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
+ hdx/scraper/framework/scrapers/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
+ hdx/scraper/framework/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
+ hdx/scraper/framework/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
+ hdx/scraper/framework/utilities/reader.py,sha256=0XQ335Qj0ihafDklsXhDa5GHLux4FThIM4oZt1B5uLo,26814
+ hdx/scraper/framework/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
+ hdx/scraper/framework/utilities/sector.py,sha256=rl_TceRYc5YRoLccr0ABCM42ZLLtLzezWWWQ5YtbQDE,1947
+ hdx/scraper/framework/utilities/sector_configuration.yaml,sha256=LAUR5xfLU5qua5qtc3TcwEei0sD1zoCb_vfAxD7Grb8,3894
+ hdx/scraper/framework/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
+ hdx/scraper/framework/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
+ hdx_python_scraper-2.5.3.dist-info/METADATA,sha256=eJwqy5OyM3ngW2rUWWqTUpjmnDQy8ChsDLivhoAgypI,3361
+ hdx_python_scraper-2.5.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ hdx_python_scraper-2.5.3.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
+ hdx_python_scraper-2.5.3.dist-info/RECORD,,

{hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.5.3.dist-info}/WHEEL
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: hatchling 1.21.1
+ Generator: hatchling 1.27.0
  Root-Is-Purelib: true
  Tag: py3-none-any

hdx_python_scraper-2.3.5.dist-info/RECORD (removed)
@@ -1,25 +0,0 @@
- hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
- hdx/scraper/_version.py,sha256=-9aYLvgAp04zL8yFAMPjvf6kLKgqW1mLgyuk6XA3LcE,411
- hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
- hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
- hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
- hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
- hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
- hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
- hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
- hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
- hdx/scraper/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
- hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1Ep39QY,3087
- hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
- hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
- hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
- hdx/scraper/utilities/reader.py,sha256=hexLIJW3CdP4DmobqMM-Z2d6pgcCs1zWWBW-stqoeNU,22975
- hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
- hdx/scraper/utilities/sources.py,sha256=VNhFYSUM2xeDlN6y4Ya9_0BskjPtjwQZmCKnQgpOemQ,11511
- hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
- hdx_python_scraper-2.3.5.dist-info/METADATA,sha256=jYBTVEB111S1R3Cj8fZByzM4E3nRRKCr31bsCPstjPA,3318
- hdx_python_scraper-2.3.5.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
- hdx_python_scraper-2.3.5.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
- hdx_python_scraper-2.3.5.dist-info/RECORD,,