PyPI - hdx-python-scraper - Versions diffs - 2.3.4__py3-none-any.whl → 2.5.2__py3-none-any.whl - Mend

hdx-python-scraper: 2.3.4 (py3-none-any.whl) → 2.5.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

hdx/scraper/{__init__.py → framework/__init__.py} RENAMED Viewed

File without changes

hdx/scraper/{_version.py → framework/_version.py} RENAMED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2.3.4'
-__version_tuple__ = version_tuple = (2, 3, 4)
+__version__ = version = '2.5.2'
+__version_tuple__ = version_tuple = (2, 5, 2)

hdx/scraper/{base_scraper.py → framework/base_scraper.py} RENAMED Viewed

@@ -36,7 +36,7 @@ class BaseScraper(ABC):
             self.reader = datasetinfo.get("reader", name)
         self.setup(headers, source_configuration)
         self.datasetinfo = deepcopy(datasetinfo)
-        self.errors_on_exit = None
+        self.error_handler = None
         self.can_fallback = True
     def setup(
@@ -141,9 +141,9 @@ class BaseScraper(ABC):
             "should_overwrite_sources"
         )
         if should_overwrite_sources is not None:
-            self.source_configuration[
-                "should_overwrite_sources"
-            ] = should_overwrite_sources
+            self.source_configuration["should_overwrite_sources"] = (
+                should_overwrite_sources
+            )
         source = self.datasetinfo["source"]
         if isinstance(source, str):
             source = {"default_source": source}

hdx/scraper/{outputs → framework/outputs}/googlesheets.py RENAMED Viewed

@@ -91,4 +91,4 @@ class GoogleSheets(BaseOutput):
                 df.fillna("NaN", inplace=True)
                 rows.extend(df.values.tolist())
                 values = rows
-            tab.update("A1", values)
+            tab.update(values, "A1")

hdx/scraper/{runner.py → framework/runner.py} RENAMED Viewed

@@ -5,18 +5,18 @@ from traceback import format_exc
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from .base_scraper import BaseScraper
-from .configurable.aggregator import Aggregator
-from .configurable.resource_downloader import ResourceDownloader
-from .configurable.scraper import ConfigurableScraper
-from .configurable.timeseries import TimeSeries
 from .outputs.base import BaseOutput
+from .scrapers.aggregator import Aggregator
+from .scrapers.configurable_scraper import ConfigurableScraper
+from .scrapers.resource_downloader import ResourceDownloader
+from .scrapers.timeseries import TimeSeries
 from .utilities import get_startend_dates_from_time_period
 from .utilities.fallbacks import Fallbacks
 from .utilities.reader import Read
 from .utilities.sources import Sources
 from hdx.location.adminlevel import AdminLevel
 from hdx.utilities.dateparse import now_utc
-from hdx.utilities.errors_onexit import ErrorsOnExit
+from hdx.utilities.error_handler import ErrorHandler
 from hdx.utilities.typehint import ListTuple
 logger = logging.getLogger(__name__)
@@ -28,7 +28,7 @@ class Runner:
     Args:
         countryiso3s (ListTuple[str]): List of ISO3 country codes to process
         today (datetime): Value to use for today. Defaults to now_utc().
-        errors_on_exit (ErrorsOnExit): ErrorsOnExit object that logs errors on exit
+        error_handler (ErrorHandler): ErrorHandler object that logs errors on exit
         scrapers_to_run (Optional[ListTuple[str]]): Scrapers to run. Defaults to None (all scrapers).
     """
@@ -36,12 +36,12 @@ class Runner:
         self,
         countryiso3s: ListTuple[str],
         today: datetime = now_utc(),
-        errors_on_exit: Optional[ErrorsOnExit] = None,
+        error_handler: Optional[ErrorHandler] = None,
         scrapers_to_run: Optional[ListTuple[str]] = None,
     ):
         self.countryiso3s = countryiso3s
         self.today = today
-        self.errors_on_exit = errors_on_exit
+        self.error_handler = error_handler
         if isinstance(scrapers_to_run, tuple):
             scrapers_to_run = list(scrapers_to_run)
         self.scrapers_to_run: Optional[List[str]] = scrapers_to_run
@@ -73,7 +73,7 @@ class Runner:
             and scraper_name not in self.scrapers_to_run
         ):
             self.scrapers_to_run.append(scraper_name)
-        scraper.errors_on_exit = self.errors_on_exit
+        scraper.error_handler = self.error_handler
         return scraper_name
     def add_customs(
@@ -106,6 +106,7 @@ class Runner:
         source_configuration: Dict = {},
         suffix: Optional[str] = None,
         force_add_to_run: bool = False,
+        countryiso3s: Optional[List[str]] = None,
     ) -> str:
         """Add configurable scraper to the run. If running specific scrapers rather than
         all, and you want to force the inclusion of the scraper in the run regardless of
@@ -121,6 +122,7 @@ class Runner:
             source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
             suffix (Optional[str]): Suffix to add to the scraper name
             force_add_to_run (bool): Whether to force include the scraper in the next run
+            countryiso3s (Optional[List[str]]): Override list of country iso3s. Defaults to None.
         Returns:
             str: scraper name (including suffix if set)
@@ -129,16 +131,18 @@ class Runner:
             scraper_name = f"{name}{suffix}"
         else:
             scraper_name = name
+        if not countryiso3s:
+            countryiso3s = self.countryiso3s
         self.scrapers[scraper_name] = ConfigurableScraper(
             name,
             datasetinfo,
             level,
-            self.countryiso3s,
+            countryiso3s,
             adminlevel,
             level_name,
             source_configuration,
             self.today,
-            self.errors_on_exit,
+            self.error_handler,
         )
         if scraper_name not in self.scraper_names:
             self.scraper_names.append(scraper_name)
@@ -159,6 +163,7 @@ class Runner:
         source_configuration: Dict = {},
         suffix: Optional[str] = None,
         force_add_to_run: bool = False,
+        countryiso3s: Optional[List[str]] = None,
     ) -> List[str]:
         """Add multiple configurable scrapers to the run. If running specific scrapers
         rather than all, and you want to force the inclusion of the scraper in the run
@@ -173,6 +178,7 @@ class Runner:
             source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
             suffix (Optional[str]): Suffix to add to the scraper name
             force_add_to_run (bool): Whether to force include the scraper in the next run
+            countryiso3s (Optional[List[str]]): Override list of country iso3s. Defaults to None.
         Returns:
             List[str]: scraper names (including suffix if set)
@@ -190,6 +196,7 @@ class Runner:
                     source_configuration,
                     suffix,
                     force_add_to_run,
+                    countryiso3s,
                 )
             )
         return keys
@@ -516,6 +523,21 @@ class Runner:
             raise ValueError(f"No such scraper {name}!")
         return scraper
+    def delete_scraper(self, name: str) -> bool:
+        """Delete scraper with given name
+        Args:
+            name (str): Name of scraper
+        Returns:
+            bool: True if the scraper was present, False if not
+        """
+        if name not in self.scraper_names:
+            return False
+        self.scraper_names.remove(name)
+        del self.scrapers[name]
+        return True
     def add_instance_variables(self, name: str, **kwargs: Any) -> None:
         """Add instance variables to scraper instance given scraper name
@@ -590,8 +612,8 @@ class Runner:
                 if not Fallbacks.exist() or scraper.can_fallback is False:
                     raise
                 logger.exception(f"Using fallbacks for {scraper.name}!")
-                if self.errors_on_exit:
-                    self.errors_on_exit.add(
+                if self.error_handler:
+                    self.error_handler.add(
                         f"Using fallbacks for {scraper.name}! Error: {format_exc()}"
                     )
                 for level in scraper.headers.keys():

hdx/scraper/{configurable/scraper.py → framework/scrapers/configurable_scraper.py} RENAMED Viewed

@@ -17,7 +17,7 @@ from hdx.utilities.dateparse import (
 )
 from hdx.utilities.dictandlist import dict_of_lists_add
 from hdx.utilities.downloader import DownloadError
-from hdx.utilities.errors_onexit import ErrorsOnExit
+from hdx.utilities.error_handler import ErrorHandler
 from hdx.utilities.text import (  # noqa: F401
     get_fraction_str,
     get_numeric_if_possible,
@@ -42,7 +42,7 @@ class ConfigurableScraper(BaseScraper):
         level_name (Optional[str]): Customised level_name name. Defaults to None (level).
         source_configuration (Dict): Configuration for sources. Defaults to empty dict (use defaults).
         today (datetime): Value to use for today. Defaults to now_utc().
-        errors_on_exit (Optional[ErrorsOnExit]): ErrorsOnExit object that logs errors on exit
+        error_handler (Optional[ErrorHandler]): ErrorHandler object that logs errors on exit
         **kwargs: Variables to use when evaluating template arguments in urls
     """
@@ -67,7 +67,7 @@ class ConfigurableScraper(BaseScraper):
         level_name: Optional[str] = None,
         source_configuration: Dict = {},
         today: datetime = now_utc(),
-        errors_on_exit: Optional[ErrorsOnExit] = None,
+        error_handler: Optional[ErrorHandler] = None,
         **kwargs: Any,
     ):
         self.name = name
@@ -83,10 +83,10 @@ class ConfigurableScraper(BaseScraper):
         else:
             self.level_name: str = level_name
         self.countryiso3s = countryiso3s
-        self.adminlevel = adminlevel
+        self.adminlevel: Optional[AdminLevel] = adminlevel
         self.today = today
         self.subsets = self.get_subsets_from_datasetinfo(datasetinfo)
-        self.errors_on_exit: Optional[ErrorsOnExit] = errors_on_exit
+        self.error_handler: Optional[ErrorHandler] = error_handler
         self.variables = kwargs
         self.rowparser = None
         self.datasetinfo = copy.deepcopy(datasetinfo)

hdx/scraper/{configurable → framework/scrapers}/rowparser.py RENAMED Viewed

@@ -185,20 +185,14 @@ class RowParser:
         Returns:
             Iterator[Dict]: Input data with prefilter applied if specified and sorted if specified or deemed necessary
         """
-        rows = []
-        for row in iterator:
-            if self.header_to_hxltag:
-                newrow = {}
-                for header in row:
-                    newrow[self.header_to_hxltag[header]] = row[header]
-                row = newrow
-            if self.stop_row:
-                if all(
-                    row[key] == value for key, value in self.stop_row.items()
-                ):
-                    break
-            for newrow in self.flatten(row):
-                rows.append(newrow)
+        if self.header_to_hxltag:
+            iterator = self.header_to_hxltag_rows(iterator)
+        if self.stop_row:
+            iterator = self.stop_rows(iterator)
+        if self.flatteninfo:
+            iterator = self.flatten_rows(iterator)
+        if self.prefilter:
+            iterator = (row for row in iterator if eval(self.prefilter))
         if not self.sort:
             if self.datecol:
                 for subset in self.subsets:
@@ -212,15 +206,59 @@ class RowParser:
                         )
                         self.sort = {"keys": [self.datecol], "reverse": True}
                         break
-        if self.prefilter:
-            rows = [row for row in rows if eval(self.prefilter)]
         if self.sort:
             keys = self.sort["keys"]
             reverse = self.sort.get("reverse", False)
-            rows = sorted(rows, key=itemgetter(*keys), reverse=reverse)
-        return rows
+            iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
+        return iterator
+    def header_to_hxltag_rows(
+        self, iterator: Iterator[Dict]
+    ) -> Generator[Dict, None, None]:
+        """Convert headers to HXL tags in keys
+        Args:
+            iterator (Iterator[Dict]): Input data
+        Returns:
+            Generator[Dict]: Rows where keys are HXL tags
+        """
+        for row in iterator:
+            newrow = {}
+            for header in row:
+                newrow[self.header_to_hxltag[header]] = row[header]
+            yield newrow
+    def stop_rows(
+        self, iterator: Iterator[Dict]
+    ) -> Generator[Dict, None, None]:
+        """Stop processing rows after condition met
+        Args:
+            iterator (Iterator[Dict]): Input data
+        Returns:
+            Generator[Dict]: Rows up to stop condition
+        """
+        for row in iterator:
+            if all(row[key] == value for key, value in self.stop_row.items()):
+                break
+            yield row
+    def flatten_rows(self, iterator: Iterator[Dict]) -> Iterator[Dict]:
+        """Flatten rows
+        Args:
+            iterator (Iterator[Dict]): Input data
+        Returns:
+            Generator[Dict]: Flattened rows
+        """
+        for row in iterator:
+            for newrow in self.flatten_row(row):
+                yield newrow
-    def flatten(self, row: Dict) -> Generator[Dict, None, None]:
+    def flatten_row(self, row: Dict) -> Generator[Dict, None, None]:
         """Flatten a wide spreadsheet format into a long one
         Args:
@@ -229,9 +267,6 @@ class RowParser:
         Returns:
             Generator[Dict]: Flattened row(s)
         """
-        if not self.flatteninfo:
-            yield row
-            return
         counters = [-1 for _ in self.flatteninfo]
         while True:
             newrow = copy.deepcopy(row)
@@ -314,7 +349,7 @@ class RowParser:
                     adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
                 elif i == 1:
                     adms[i], exact = self.adminlevel.get_pcode(
-                        adms[0], adm, self.name
+                        adms[0], adm, logname=self.name
                     )
                 if adms[i] not in self.adms[i]:
                     adms[i] = None

hdx/scraper/{utilities → framework/utilities}/reader.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import glob
 import logging
 from datetime import datetime
 from os.path import join
@@ -10,6 +11,7 @@ from slugify import slugify
 from . import get_startend_dates_from_time_period, match_template
 from .sources import Sources
+from hdx.api.configuration import Configuration
 from hdx.data.dataset import Dataset
 from hdx.data.resource import Resource
 from hdx.utilities.dateparse import parse_date
@@ -204,15 +206,19 @@ class Read(Retrieve):
         if headers is None:
             headers = 1
             datasetinfo["headers"] = 1
-        kwargs["headers"] = headers
-        if isinstance(headers, list):
-            kwargs["fill_merged_cells"] = True
         format = datasetinfo["format"]
         kwargs["format"] = format
-        if not sheet and format in ("xls", "xlsx"):
-            sheet = 1
+        if format in ("xls", "xlsx"):
+            if not sheet:
+                sheet = 1
+            if isinstance(headers, list):
+                kwargs["fill_merged_cells"] = True
+            elif "fill_merged_cells" not in kwargs:
+                kwargs["fill_merged_cells"] = False
+            kwargs["xlsx2csv"] = datasetinfo.get("xlsx2csv", False)
         if sheet:
             kwargs["sheet"] = sheet
+        kwargs["headers"] = headers
         compression = datasetinfo.get("compression")
         if compression:
             kwargs["compression"] = compression
@@ -238,11 +244,14 @@ class Read(Retrieve):
             **kwargs,
         )
-    def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
+    def read_dataset(
+        self, dataset_name: str, configuration: Optional[Configuration] = None
+    ) -> Optional[Dataset]:
         """Read HDX dataset
         Args:
             dataset_name (str): Dataset name
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
         Returns:
             Optional[Dataset]: The dataset that was read or None
@@ -252,7 +261,7 @@ class Read(Retrieve):
             logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
             dataset = Dataset.load_from_json(saved_path)
         else:
-            dataset = Dataset.read_from_hdx(dataset_name)
+            dataset = Dataset.read_from_hdx(dataset_name, configuration)
             if self.save:
                 logger.info(f"Saving dataset {dataset_name} in {saved_path}")
                 if dataset is None:
@@ -261,6 +270,56 @@ class Read(Retrieve):
                     dataset.save_to_json(saved_path, follow_urls=True)
         return dataset
+    def search_datasets(
+        self,
+        filename: str,
+        query: Optional[str] = "*:*",
+        configuration: Optional[Configuration] = None,
+        page_size: int = 1000,
+        **kwargs: Any,
+    ) -> List[Dataset]:
+        """Read HDX dataset
+        Args:
+            filename (str): Filename for saved files. Will be prefixed by underscore and a number.
+            query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
+            page_size (int): Size of page to return. Defaults to 1000.
+            **kwargs: See below
+            fq (string): Any filter queries to apply
+            rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
+            start (int): Offset in the complete result for where the set of returned datasets should begin
+            sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
+            facet (string): Whether to enable faceted results. Default to True.
+            facet.mincount (int): Minimum counts for facet fields should be included in the results
+            facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
+            facet.field (List[str]): Fields to facet upon. Default is empty.
+            use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.
+        Returns:
+            List[Dataset]: list of datasets resulting from query
+        """
+        saved_path = join(self.saved_dir, filename)
+        if self.use_saved:
+            logger.info(
+                f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
+            )
+            datasets = []
+            for file_path in sorted(glob.glob(f"{saved_path}_*.json")):
+                datasets.append(Dataset.load_from_json(file_path))
+        else:
+            datasets = Dataset.search_in_hdx(
+                query, configuration, page_size, **kwargs
+            )
+            if self.save:
+                for i, dataset in enumerate(datasets):
+                    file_path = f"{saved_path}_{i}.json"
+                    name = dataset["name"]
+                    logger.info(f"Saving dataset {name} in {file_path}")
+                    dataset.save_to_json(file_path, follow_urls=True)
+        return datasets
     @staticmethod
     def construct_filename(name: str, format: str):
         """Construct filename from name and format. The filename of the file
@@ -438,7 +497,10 @@ class Read(Retrieve):
         return self.hxl_info_file(name, format, url, **kwargs)
     def read_hdx_metadata(
-        self, datasetinfo: Dict, do_resource_check: bool = True
+        self,
+        datasetinfo: Dict,
+        do_resource_check: bool = True,
+        configuration: Optional[Configuration] = None,
     ) -> Optional[Resource]:
         """Read metadata from HDX dataset and add to input dictionary. If url
         is not supplied, will look through resources for one that matches
@@ -454,13 +516,14 @@ class Read(Retrieve):
         Args:
             datasetinfo (Dict): Dictionary of information about dataset
             do_resource_check (bool): Whether to check resources. Defaults to False.
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
         Returns:
             Optional[Resource]: The resource if a url was not given
         """
         dataset_nameinfo = datasetinfo["dataset"]
         if isinstance(dataset_nameinfo, str):
-            dataset = self.read_dataset(dataset_nameinfo)
+            dataset = self.read_dataset(dataset_nameinfo, configuration)
             resource = None
             url = datasetinfo.get("url")
             resource_name = datasetinfo.get("resource")
@@ -491,24 +554,24 @@ class Read(Retrieve):
                 else:
                     url = resource["url"]  # otherwise set the url key in
                     # datasetinfo to the resource url (by setting url here)
-                datasetinfo[
-                    "hapi_resource_metadata"
-                ] = self.get_hapi_resource_metadata(resource)
+                datasetinfo["hapi_resource_metadata"] = (
+                    self.get_hapi_resource_metadata(resource)
+                )
                 datasetinfo["url"] = url
             if "source_date" not in datasetinfo:
-                datasetinfo[
-                    "source_date"
-                ] = get_startend_dates_from_time_period(
-                    dataset, today=self.today
+                datasetinfo["source_date"] = (
+                    get_startend_dates_from_time_period(
+                        dataset, today=self.today
+                    )
                 )
             if "source" not in datasetinfo:
                 datasetinfo["source"] = dataset["dataset_source"]
             if "source_url" not in datasetinfo:
                 datasetinfo["source_url"] = dataset.get_hdx_url()
             Sources.standardise_datasetinfo_source_date(datasetinfo)
-            datasetinfo[
-                "hapi_dataset_metadata"
-            ] = self.get_hapi_dataset_metadata(dataset, datasetinfo)
+            datasetinfo["hapi_dataset_metadata"] = (
+                self.get_hapi_dataset_metadata(dataset, datasetinfo)
+            )
             return resource
         if "source_date" not in datasetinfo:
@@ -527,7 +590,7 @@ class Read(Retrieve):
         for hxltag, dataset_name in dataset_nameinfo.items():
             dataset = datasets.get(dataset_name)
             if not dataset:
-                dataset = self.read_dataset(dataset_name)
+                dataset = self.read_dataset(dataset_name, configuration)
                 datasets[dataset_name] = dataset
             if source_date is not None:
                 if hxltag == "default_dataset":
@@ -561,18 +624,22 @@ class Read(Retrieve):
     def read_hdx(
         self,
         datasetinfo: Dict,
+        configuration: Optional[Configuration] = None,
         **kwargs: Any,
     ) -> Tuple[List[str], Iterator[Dict]]:
         """Read data and metadata from HDX dataset
         Args:
             datasetinfo (Dict): Dictionary of information about dataset
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
             **kwargs: Parameters to pass to download_file call
         Returns:
             Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
         """
-        resource = self.read_hdx_metadata(datasetinfo)
+        resource = self.read_hdx_metadata(
+            datasetinfo, configuration=configuration
+        )
         filename = kwargs.get("filename")
         if filename:
             del kwargs["filename"]
@@ -593,12 +660,14 @@ class Read(Retrieve):
     def read(
         self,
         datasetinfo: Dict,
+        configuration: Optional[Configuration] = None,
         **kwargs: Any,
     ) -> Tuple[List[str], Iterator[Dict]]:
         """Read data and metadata from HDX dataset
         Args:
             datasetinfo (Dict): Dictionary of information about dataset
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
             **kwargs: Parameters to pass to download_file call
         Returns:
@@ -607,7 +676,9 @@ class Read(Retrieve):
         format = datasetinfo["format"]
         if format in ["json", "csv", "xls", "xlsx"]:
             if "dataset" in datasetinfo:
-                headers, iterator = self.read_hdx(datasetinfo, **kwargs)
+                headers, iterator = self.read_hdx(
+                    datasetinfo, configuration, **kwargs
+                )
             else:
                 headers, iterator = self.read_tabular(datasetinfo, **kwargs)
         else:

hdx/scraper/framework/utilities/sector.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Populate the sector mapping."""
+import logging
+from copy import copy
+from typing import Dict, Optional
+from .reader import Read
+from hdx.utilities.loader import load_yaml
+from hdx.utilities.matching import get_code_from_name
+from hdx.utilities.path import script_dir_plus_file
+from hdx.utilities.text import normalise
+logger = logging.getLogger(__name__)
+class Sector:
+    def __init__(
+        self,
+        configuration: Optional[Dict] = None,
+    ):
+        if configuration is None:
+            configuration = load_yaml(
+                script_dir_plus_file("sector_configuration.yaml", Sector)
+            )
+        self._datasetinfo = configuration["sector"]
+        self.data = copy(configuration["sector_map"])
+        self.unmatched = []
+        self.populate()
+    def populate(self) -> None:
+        logger.info("Populating sector mapping")
+        def parse_sector_values(code: str, name: str):
+            self.data[name] = code
+            self.data[code] = code
+            self.data[normalise(name)] = code
+            self.data[normalise(code)] = code
+        reader = Read.get_reader()
+        headers, iterator = reader.read(
+            self._datasetinfo, file_prefix="sector"
+        )
+        for row in iterator:
+            parse_sector_values(
+                code=row["#sector +code +acronym"],
+                name=row["#sector +name +preferred +i_en"],
+            )
+        extra_entries = {
+            "Cash": "Cash programming",
+            "Hum": "Humanitarian assistance (unspecified)",
+            "Multi": "Multi-sector (unspecified)",
+            "Intersectoral": "Intersectoral",
+        }
+        for code, name in extra_entries.items():
+            parse_sector_values(code=code, name=name)
+    def get_sector_code(self, sector: str) -> str | None:
+        return get_code_from_name(
+            name=sector,
+            code_lookup=self.data,
+            unmatched=self.unmatched,
+        )

hdx/scraper/framework/utilities/sector_configuration.yaml ADDED Viewed

@@ -0,0 +1,138 @@
+sector:
+  dataset: "global-coordination-groups-beta"
+  resource: "Global Coordination Groups (Beta) CSV"
+  format: "csv"
+  headers: 2
+sector_map:
+  abna: "SHL"
+  abri: "SHL"
+  abri bna: "SHL"
+  abris: "SHL"
+  abris ame: "SHL"
+  abris bna: "SHL"
+  abris bna cccm: "SHL"
+  abris durgence et nfi: "SHL"
+  abris nfi: "SHL"
+  action contre les mines: "PRO-MIN"
+  aee: "SHL"
+  agriculture: "FSC"
+  agua saneamiento e higiene: "WSH"
+  all: "Intersectoral"
+  alojamiento de emergencia: "SHL"
+  alojamiento de emergencia shelter: "SHL"
+  alojamientos y asentamientos: "SHL"
+  ame: "SHL"
+  ash: "WSH"
+  assainissement: "WSH"
+  camp coordination and camp management: "CCM"
+  camp coordination camp management: "CCM"
+  cash: "Cash"
+  cccm: "CCM"
+  ccs: "CCM"
+  cluster coordination: "CCM"
+  coord services support: "CCM"
+  coordinacion informacion: "CCM"
+  coordination: "CCM"
+  coordination et gestion des camps: "CCM"
+  eah: "WSH"
+  eau: "WSH"
+  eau assainissement et hygiene: "WSH"
+  eau hygiene: "WSH"
+  eau hygiene assainissement: "WSH"
+  eau hygiene et assainissement: "WSH"
+  educacion: "EDU"
+  educacion en emergencias: "EDU"
+  education: "EDU"
+  eha: "WSH"
+  emergency shelter and non food items: "SHL"
+  epah: "WSH"
+  erl: "ERY"
+  esnfi: "SHL"
+  explosive hazards: "PRO-MIN"
+  food: "FSC"
+  food security and agriculture: "FSC"
+  food security and livelihoods: "FSC"
+  food security and nutrition: "FSC"
+  food security livelihood: "FSC"
+  fsl: "FSC"
+  gestion des sites daccueil temporaires: "SHL"
+  gbv: "PRO-GBV"
+  hlp: "PRO-HLP"
+  humanitaire: "Hum"
+  hygiene: "WSH"
+  hygiene assainissement: "WSH"
+  intercluster: "Multi"  # From Somalia 3W, hopefully not to be confused with intersectoral
+  logement terre et biens: "PRO-HLP"
+  logistica: "LOG"
+  logistique: "LOG"
+  manejo y gestion de campamentos: "CCM"
+  ms: "Multi"
+  multi secteur: "Multi"
+  multisectoriel: "Multi"
+  nutricion: "NUT"
+  nutrition: "NUT"
+  operatioanl presence water sanitation hygiene: "WSH"
+  operational presence education in emergencies: "EDU"
+  operational presence emergency shelter non food items: "SHL"
+  operational presence food security agriculture: "FSC"
+  operational presence health: "HEA"
+  operational presence nutrition: "NUT"
+  operational presence protection: "PRO"
+  pro cpm: "PRO-CPN"
+  pronna: "PRO-CPN"
+  propg: "PRO"
+  proteccion infantil: "PRO-CPN"
+  protection: "PRO"
+  protection de lenfance: "PRO-CPN"
+  protection de lenfant: "PRO-CPN"
+  protection generale: "PRO"
+  protection logement terre et propriete: "PRO-HLP"
+  protection ltb: "PRO-HLP"
+  protection lutte anti mines: "PRO-MIN"
+  protection pe: "PRO-CPN"
+  protection protection de lenfant: "PRO-CPN"
+  protection violences basees sur le genre: "PRO-GBV"
+  protection vgb: "PRO-GBV"
+  proteccion: "PRO"
+  provbg: "PRO-GBV"
+  psea: "PRO-GBV"
+  rapid response mechanism: "ERY"
+  rcf: "CCM"
+  rcf education: "EDU"
+  rcf food security and livelihoods: "FSC"
+  rcf health and nutrtion: "HEA"
+  rcf protection: "PRO"
+  recuperacion temprana: "ERY"
+  relevement precoce: "ERY"
+  relevement rapide: "ERY"
+  refugee response: "CCM"
+  refugees migrants multi sector: "CCM"
+  reponse aux refugies: "CCM"
+  sa: "FSC"
+  sal: "HEA"
+  salud: "HEA"
+  samv: "FSC"
+  sante: "HEA"
+  securite alimentaire: "FSC"
+  seguridad alimentaria: "FSC"
+  seguridad alimentaria y nutricion: "FSC"
+  services humanitaires communs: "Hum"
+  sexual and reproductive health: "HEA"
+  shelter: "SHL"
+  shelter nfi: "SHL"
+  shelter nfis: "SHL"
+  shelter and nfi: "SHL"
+  shelter and nfis: "SHL"
+  shelter and non food items: "SHL"
+  site management: "CCM"
+  snfi: "SHL"
+  telecommunications: "TEL"
+  telecommunications durgence: "TEL"
+  telecomunicaciones de emergencia: "TEL"
+  vbg: "PRO-GBV"
+  violences basees sur le genre: "PRO-GBV"
+  violence basee sur le genre: "PRO-GBV"
+  violencia basada en genero: "PRO-GBV"
+  wash: "WSH"
+  water sanitation and hygiene: "WSH"

hdx/scraper/{utilities → framework/utilities}/sources.py RENAMED Viewed

@@ -282,9 +282,9 @@ class Sources:
         if no_sources:
             source_configuration["no_sources"] = True
             return source_configuration
-        source_configuration[
-            "should_overwrite_sources"
-        ] = should_overwrite_sources
+        source_configuration["should_overwrite_sources"] = (
+            should_overwrite_sources
+        )
         if suffix_attribute:
             source_configuration["suffix_attribute"] = suffix_attribute
             return source_configuration

{hdx_python_scraper-2.3.4.dist-info → hdx_python_scraper-2.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: hdx-python-scraper
-Version: 2.3.4
+Version: 2.5.2
 Summary: HDX Python scraper utilities to assemble data from multiple sources
 Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
 Author-email: Michael Rans <rans@email.com>
@@ -26,13 +26,14 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Requires-Dist: gspread
-Requires-Dist: hdx-python-api>=6.2.1
-Requires-Dist: hdx-python-country>=3.6.4
+Requires-Dist: hdx-python-api>=6.3.7
+Requires-Dist: hdx-python-country>=3.8.6
+Requires-Dist: hdx-python-utilities>=3.8.2
 Requires-Dist: regex
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == 'dev'
 Provides-Extra: pandas
-Requires-Dist: pandas>=2.1.3; extra == 'pandas'
+Requires-Dist: pandas>=2.2.2; extra == 'pandas'
 Provides-Extra: test
 Requires-Dist: pytest; extra == 'test'
 Requires-Dist: pytest-cov; extra == 'test'

hdx_python_scraper-2.5.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,27 @@
+hdx/scraper/framework/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
+hdx/scraper/framework/_version.py,sha256=qrwMUvCUqANtlUPbnE5wPCDZujNKWYOaJRJsJky27Ac,411
+hdx/scraper/framework/base_scraper.py,sha256=vvwljQ5QWr6hpCjOS89RG1pvC955aLoPvm6pSovO75o,15432
+hdx/scraper/framework/runner.py,sha256=GFnZM9HciZFibwwRgDHVk9F_y2n27ctpRwyeD1_ZcKw,53538
+hdx/scraper/framework/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+hdx/scraper/framework/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
+hdx/scraper/framework/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
+hdx/scraper/framework/outputs/googlesheets.py,sha256=jLAfXz4usmLFrePxRIsMflxKPzSGv9T3jlMpSV-s4II,3087
+hdx/scraper/framework/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
+hdx/scraper/framework/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+hdx/scraper/framework/scrapers/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
+hdx/scraper/framework/scrapers/configurable_scraper.py,sha256=PYPtU9XZALNx-2Jr8a8kVVDsT2j9yGgBaw6wXhztQIM,20612
+hdx/scraper/framework/scrapers/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
+hdx/scraper/framework/scrapers/rowparser.py,sha256=bH05JUqViIVes9T7gWp0D2778BlFiJuNHmdovSFdFoI,15614
+hdx/scraper/framework/scrapers/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
+hdx/scraper/framework/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
+hdx/scraper/framework/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
+hdx/scraper/framework/utilities/reader.py,sha256=pQcGg5TIhl3c-QX_F1sZxY4Ar0N7TLalX38IMuCXA-0,26568
+hdx/scraper/framework/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
+hdx/scraper/framework/utilities/sector.py,sha256=rl_TceRYc5YRoLccr0ABCM42ZLLtLzezWWWQ5YtbQDE,1947
+hdx/scraper/framework/utilities/sector_configuration.yaml,sha256=LAUR5xfLU5qua5qtc3TcwEei0sD1zoCb_vfAxD7Grb8,3894
+hdx/scraper/framework/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
+hdx/scraper/framework/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
+hdx_python_scraper-2.5.2.dist-info/METADATA,sha256=zNHR55fmxnxl_0K3u7zAl1rz1molJ7DX6meBJmK49Es,3361
+hdx_python_scraper-2.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hdx_python_scraper-2.5.2.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
+hdx_python_scraper-2.5.2.dist-info/RECORD,,

{hdx_python_scraper-2.3.4.dist-info → hdx_python_scraper-2.5.2.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.21.1
+Generator: hatchling 1.27.0
 Root-Is-Purelib: true
 Tag: py3-none-any

hdx_python_scraper-2.3.4.dist-info/RECORD DELETED Viewed

@@ -1,25 +0,0 @@
-hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
-hdx/scraper/_version.py,sha256=-yOUI-ZIjXgov3YpdPKmW_w-fIBrZtGytjk8Bz_DwDI,411
-hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
-hdx/scraper/runner.py,sha256=3UoVi5jVRcex0U8gf1TTBLXGxisRPmCMSV8jUYHWZZM,52750
-hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
-hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
-hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
-hdx/scraper/configurable/scraper.py,sha256=4f4kNbG0HCIfPe1ft93T247s841rk1fP4cIpkFQ6NWU,20594
-hdx/scraper/configurable/timeseries.py,sha256=oAby_sGL6NmRoKnDG_fMB952W9zvzujPIsXkbqcXv-o,3027
-hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
-hdx/scraper/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
-hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1Ep39QY,3087
-hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
-hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
-hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
-hdx/scraper/utilities/reader.py,sha256=hexLIJW3CdP4DmobqMM-Z2d6pgcCs1zWWBW-stqoeNU,22975
-hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
-hdx/scraper/utilities/sources.py,sha256=VNhFYSUM2xeDlN6y4Ya9_0BskjPtjwQZmCKnQgpOemQ,11511
-hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
-hdx_python_scraper-2.3.4.dist-info/METADATA,sha256=fCv1Y7-m0IgaLUhfNddwjCPEnl7tOheLDntDhngefQc,3318
-hdx_python_scraper-2.3.4.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
-hdx_python_scraper-2.3.4.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
-hdx_python_scraper-2.3.4.dist-info/RECORD,,