hdx-python-scraper 2.1.9__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdx/scraper/_version.py +14 -2
- hdx/scraper/base_scraper.py +13 -4
- hdx/scraper/configurable/scraper.py +6 -1
- hdx/scraper/runner.py +49 -23
- hdx/scraper/utilities/reader.py +15 -7
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/METADATA +3 -3
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/RECORD +9 -9
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/WHEEL +0 -0
- {hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/licenses/LICENSE +0 -0
hdx/scraper/_version.py
CHANGED

@@ -1,4 +1,16 @@
 # file generated by setuptools_scm
 # don't change, don't track in version control
-__version__ = version = '2.1.9'
-__version_tuple__ = version_tuple = (2, 1, 9)
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '2.2.1'
+__version_tuple__ = version_tuple = (2, 2, 1)
hdx/scraper/base_scraper.py
CHANGED

@@ -346,14 +346,23 @@ class BaseScraper(ABC):
         """
         return self.source_urls
 
-    def get_hapi_metadata(self) -> Optional[Dict]:
+    def get_hapi_dataset_metadata(self) -> Optional[Dict]:
         """
-        Get HAPI metadata
+        Get HAPI dataset metadata
 
         Returns:
-            Optional[Dict]: HAPI metadata
+            Optional[Dict]: HAPI dataset metadata
         """
-        return self.datasetinfo.get("hapi_metadata")
+        return self.datasetinfo.get("hapi_dataset_metadata")
+
+    def get_hapi_resource_metadata(self) -> Optional[Dict]:
+        """
+        Get HAPI resource metadata
+
+        Returns:
+            Optional[Dict]: HAPI resource metadata
+        """
+        return self.datasetinfo.get("hapi_resource_metadata")
 
     def add_population(self) -> None:
         """
hdx/scraper/configurable/scraper.py
CHANGED

@@ -122,6 +122,7 @@ class ConfigurableScraper(BaseScraper):
             "input": datasetinfo.get("input", []),
             "transform": datasetinfo.get("transform", {}),
             "population_key": datasetinfo.get("population_key"),
+            "list": datasetinfo.get("list", []),
             "process": datasetinfo.get("process", []),
             "input_keep": datasetinfo.get("input_keep", []),
             "input_append": datasetinfo.get("input_append", []),

@@ -292,6 +293,7 @@ class ConfigurableScraper(BaseScraper):
         filter = subset["filter"]
         input_ignore_vals = subset.get("input_ignore_vals", [])
         input_transforms = subset.get("transform", {})
+        list_cols = subset.get("list")
         sum_cols = subset.get("sum")
         process_cols = subset.get("process")
         input_append = subset.get("input_append", [])

@@ -304,6 +306,8 @@ class ConfigurableScraper(BaseScraper):
                     val = eval(input_transform.replace(valcol, "val"))
                 if sum_cols or process_cols:
                     dict_of_lists_add(valuedict, adm, val)
+                elif list_cols and valcol in list_cols:
+                    dict_of_lists_add(valuedict, adm, val)
                 else:
                     curval = valuedict.get(adm)
                     if valcol in input_append:

@@ -326,6 +330,7 @@ class ConfigurableScraper(BaseScraper):
             population_str = "self.population_lookup[adm]"
         else:
             population_str = "self.population_lookup[population_key]"
+        subset.get("list")
         process_cols = subset.get("process")
         input_keep = subset.get("input_keep", [])
         sum_cols = subset.get("sum")

@@ -440,7 +445,7 @@ class ConfigurableScraper(BaseScraper):
                     valcols[i], f"newvaldicts[{i}][adm]"
                 )
             formula = formula.replace("#pzbgvjh", population_str)
-            for adm in valdicts[0]
+            for adm in valdicts[0]:
                 try:
                     val = eval(formula)
                 except (ValueError, TypeError, KeyError):
hdx/scraper/runner.py
CHANGED

@@ -1146,51 +1146,67 @@ class Runner:
 
     def get_hapi_metadata(
         self, names: Optional[ListTuple[str]] = None
-    ) ->
-        """Get HAPI metadata for all datasets
+    ) -> Dict:
+        """Get HAPI metadata for all datasets. A dictionary is returned that
+        maps from dataset ids to a dictionary. The dictionary has keys for
+        dataset metadata and a key resources under which is a dictionary that
+        maps from resource ids to resource metadata.
 
         Args:
             names (Optional[ListTuple[str]]): Names of scrapers
 
         Returns:
-
+            Dict: HAPI metadata for all datasets
         """
         if not names:
             names = self.scrapers.keys()
-
+        results = {}
         for name in names:
             scraper = self.get_scraper(name)
             if not scraper.has_run:
                 continue
-
-
-
-
+            hapi_dataset_metadata = scraper.get_hapi_dataset_metadata()
+            hapi_resource_metadata = scraper.get_hapi_resource_metadata()
+            dataset_id = hapi_dataset_metadata["hdx_id"]
+            resource_id = hapi_resource_metadata["hdx_id"]
+            hapi_metadata = results.get(
+                dataset_id, copy(hapi_dataset_metadata)
+            )
+            hapi_resources = hapi_metadata.get("resources", {})
+            hapi_resources[resource_id] = hapi_resource_metadata
+            hapi_metadata["resources"] = hapi_resources
+            results[dataset_id] = hapi_metadata
+        return results
 
     def get_hapi_results(
         self,
         names: Optional[ListTuple[str]] = None,
         has_run: bool = True,
-    ) ->
-        """Get the results (headers
-        limiting to those in names if given and limiting
-        have been set in the constructor if previously
-        scrapers marked as having run are returned
-
-
-
-
+    ) -> Dict:
+        """Get the results (headers and values per admin level and HAPI
+        metadata) for scrapers limiting to those in names if given and limiting
+        further to those that have been set in the constructor if previously
+        given. By default, only scrapers marked as having run are returned
+        unless has_run is set to False.
+
+        A dictionary is returned where key is HDX dataset id and value is a
+        dictionary that has HAPI dataset metadata as well as a results key.
+        The value associated with the results key is a dictionary where each
+        key is an admin level. Each admin level key has a value dictionary with
+        headers, values and HAPI resource metadata. Headers is a tuple of
+        (column headers, hxl hashtags). Values is a list. HAPI resource
+        metadata is a dictionary.
 
         Args:
             names (Optional[ListTuple[str]]): Names of scrapers. Defaults to None (all scrapers).
             has_run (bool): Only get results for scrapers marked as having run. Defaults to True.
 
         Returns:
-
+            Dict: Headers and values per admin level and HAPI metadata for all datasets
         """
         if not names:
             names = self.scrapers.keys()
-        results =
+        results = {}
 
         def add_results(scraper_level, scrap, levels_used):
             nonlocal results

@@ -1201,11 +1217,21 @@
             if headers is None:
                 return
             values = scrap.get_values(scraper_level)
-
-
-
+            hapi_dataset_metadata = scrap.get_hapi_dataset_metadata()
+            hapi_resource_metadata = scrap.get_hapi_resource_metadata()
+            dataset_id = hapi_dataset_metadata["hdx_id"]
+            hapi_metadata = results.get(
+                dataset_id, copy(hapi_dataset_metadata)
+            )
+            level_results = hapi_metadata.get("results", {})
+            level_results[scraper_level] = {
+                "headers": headers,
+                "values": values,
+                "hapi_resource_metadata": hapi_resource_metadata,
+            }
+            hapi_metadata["results"] = level_results
             levels_used.add(scraper_level)
-            results
+            results[dataset_id] = hapi_metadata
 
         for name in names:
             if self.scrapers_to_run and not any(
hdx/scraper/utilities/reader.py
CHANGED

@@ -340,9 +340,16 @@ class Read(Retrieve):
     def read_hdx_metadata(
         self, datasetinfo: Dict, do_resource_check: bool = True
     ) -> Optional[Resource]:
-        """Read metadata from HDX dataset and add to input dictionary. If url
-        supplied, will look through resources for one that matches
-        use its url unless do_resource_check is False.
+        """Read metadata from HDX dataset and add to input dictionary. If url
+        is not supplied, will look through resources for one that matches
+        specified format and use its url unless do_resource_check is False.
+        The dataset key of the parameter datasetinfo will usually point to a
+        string (single dataset) but where sources vary across HXL tags can be
+        a dictionary that maps from HXL tags to datasets with the key
+        default_dataset setting a default for HXL tags. For a single dataset,
+        the keys hapi_dataset_metadata and hapi_resource_metadata will be
+        populated with more detailed dataset and resource information required
+        by HAPI.
 
         Args:
             datasetinfo (Dict): Dictionary of information about dataset

@@ -354,7 +361,9 @@
         dataset_nameinfo = datasetinfo["dataset"]
         if isinstance(dataset_nameinfo, str):
             dataset = self.read_dataset(dataset_nameinfo)
-
+            datasetinfo[
+                "hapi_dataset_metadata"
+            ] = self.get_hapi_dataset_metadata(dataset)
             resource = None
             url = datasetinfo.get("url")
             if do_resource_check and not url:

@@ -365,8 +374,8 @@
                     if resource_name and resource["name"] != resource_name:
                         continue
                     url = resource["url"]
-
-                    "
+                    datasetinfo[
+                        "hapi_resource_metadata"
                     ] = self.get_hapi_resource_metadata(resource)
                     break
             if not url:

@@ -374,7 +383,6 @@
                     f"Cannot find {format} resource in {dataset_nameinfo}!"
                 )
             datasetinfo["url"] = url
-        datasetinfo["hapi_metadata"] = hapi_metadata
         if "source_date" not in datasetinfo:
             datasetinfo[
                 "source_date"
{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: hdx-python-scraper
-Version: 2.1.9
+Version: 2.2.1
 Summary: HDX Python scraper utilities to assemble data from multiple sources
 Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
 Author-email: Michael Rans <rans@email.com>

@@ -26,12 +26,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Requires-Dist: gspread
-Requires-Dist: hdx-python-api>=6.1.
+Requires-Dist: hdx-python-api>=6.1.3
 Requires-Dist: regex
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == 'dev'
 Provides-Extra: pandas
-Requires-Dist: pandas>=2.
+Requires-Dist: pandas>=2.1.1; extra == 'pandas'
 Provides-Extra: test
 Requires-Dist: pytest; extra == 'test'
 Requires-Dist: pytest-cov; extra == 'test'
{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/RECORD
CHANGED

@@ -1,12 +1,12 @@
 hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
-hdx/scraper/_version.py,sha256=
-hdx/scraper/base_scraper.py,sha256=
-hdx/scraper/runner.py,sha256
+hdx/scraper/_version.py,sha256=R_Wr7clGXr8a07n6uqFj88MyYFGydFRXYBI10R9k_uw,411
+hdx/scraper/base_scraper.py,sha256=IaUDqnrSxB0kbEQynX-81NEyv9DLxypWKwEDAEr9GWg,14628
+hdx/scraper/runner.py,sha256=-7L-L9WGZdTGl5mWNAPgvpTreU9bvbdxklruGCRzjRs,51217
 hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
 hdx/scraper/configurable/resource_downloader.py,sha256=vK8zNFy7T_Rj1h8Tj676-3B2oYYXFNKsrM9dxz7RZC8,1537
 hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
-hdx/scraper/configurable/scraper.py,sha256=
+hdx/scraper/configurable/scraper.py,sha256=TyB7ipTzhVpOC3in0ZBIMwbcTAOR0Ul-W6Np85NnogI,20468
 hdx/scraper/configurable/timeseries.py,sha256=uhnENo7Wsy0-YVjglm7OQkXI72-te61DkepkihbQrP8,2982
 hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566

@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
 hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
 hdx/scraper/utilities/__init__.py,sha256=iBjD7bc8wEzQhwkcx2mOZwYmu28VHjl5px66quqWJ8E,2491
 hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
-hdx/scraper/utilities/reader.py,sha256=
+hdx/scraper/utilities/reader.py,sha256=awm24AUWlweJmJVE1h0iid7xb6njvF7Taf0afbGXIG4,18331
 hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
 hdx/scraper/utilities/sources.py,sha256=h27PjBADqIhqDwmhzMXt1OjwJWZc2iVnIBwJuAJKHwo,11204
 hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
-hdx_python_scraper-2.1.
-hdx_python_scraper-2.1.
-hdx_python_scraper-2.1.
-hdx_python_scraper-2.1.
+hdx_python_scraper-2.2.1.dist-info/METADATA,sha256=hnYCmTG7ZlGqfc4QKCHjBKSesZ2q7ooTbdtyAuuhkqs,3289
+hdx_python_scraper-2.2.1.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87
+hdx_python_scraper-2.2.1.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
+hdx_python_scraper-2.2.1.dist-info/RECORD,,

{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/WHEEL
File without changes

{hdx_python_scraper-2.1.9.dist-info → hdx_python_scraper-2.2.1.dist-info}/licenses/LICENSE
File without changes