PyPI - hdx-python-scraper - Versions diffs - 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl - Mend

hdx-python-scraper 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

hdx/scraper/_version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2.3.5'
-__version_tuple__ = version_tuple = (2, 3, 5)
+__version__ = version = '2.3.7'
+__version_tuple__ = version_tuple = (2, 3, 7)

hdx/scraper/base_scraper.py CHANGED Viewed

@@ -141,9 +141,9 @@ class BaseScraper(ABC):
             "should_overwrite_sources"
         )
         if should_overwrite_sources is not None:
-            self.source_configuration[
-                "should_overwrite_sources"
-            ] = should_overwrite_sources
+            self.source_configuration["should_overwrite_sources"] = (
+                should_overwrite_sources
+            )
         source = self.datasetinfo["source"]
         if isinstance(source, str):
             source = {"default_source": source}

hdx/scraper/utilities/reader.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import glob
 import logging
 from datetime import datetime
 from os.path import join
@@ -10,6 +11,7 @@ from slugify import slugify
 from . import get_startend_dates_from_time_period, match_template
 from .sources import Sources
+from hdx.api.configuration import Configuration
 from hdx.data.dataset import Dataset
 from hdx.data.resource import Resource
 from hdx.utilities.dateparse import parse_date
@@ -238,11 +240,14 @@ class Read(Retrieve):
             **kwargs,
         )
-    def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
+    def read_dataset(
+        self, dataset_name: str, configuration: Optional[Configuration] = None
+    ) -> Optional[Dataset]:
         """Read HDX dataset
         Args:
             dataset_name (str): Dataset name
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
         Returns:
             Optional[Dataset]: The dataset that was read or None
@@ -252,7 +257,7 @@ class Read(Retrieve):
             logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
             dataset = Dataset.load_from_json(saved_path)
         else:
-            dataset = Dataset.read_from_hdx(dataset_name)
+            dataset = Dataset.read_from_hdx(dataset_name, configuration)
             if self.save:
                 logger.info(f"Saving dataset {dataset_name} in {saved_path}")
                 if dataset is None:
@@ -261,6 +266,56 @@ class Read(Retrieve):
                     dataset.save_to_json(saved_path, follow_urls=True)
         return dataset
+    def search_datasets(
+        self,
+        filename: str,
+        query: Optional[str] = "*:*",
+        configuration: Optional[Configuration] = None,
+        page_size: int = 1000,
+        **kwargs: Any,
+    ) -> List[Dataset]:
+        """Read HDX dataset
+        Args:
+            filename (str): Filename for saved files. Will be prefixed by underscore and a number.
+            query (Optional[str]): Query (in Solr format). Defaults to '*:*'.
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
+            page_size (int): Size of page to return. Defaults to 1000.
+            **kwargs: See below
+            fq (string): Any filter queries to apply
+            rows (int): Number of matching rows to return. Defaults to all datasets (sys.maxsize).
+            start (int): Offset in the complete result for where the set of returned datasets should begin
+            sort (string): Sorting of results. Defaults to 'relevance asc, metadata_modified desc' if rows<=page_size or 'metadata_modified asc' if rows>page_size.
+            facet (string): Whether to enable faceted results. Default to True.
+            facet.mincount (int): Minimum counts for facet fields should be included in the results
+            facet.limit (int): Maximum number of values the facet fields return (- = unlimited). Defaults to 50.
+            facet.field (List[str]): Fields to facet upon. Default is empty.
+            use_default_schema (bool): Use default package schema instead of custom schema. Defaults to False.
+        Returns:
+            List[Dataset]: list of datasets resulting from query
+        """
+        saved_path = join(self.saved_dir, filename)
+        if self.use_saved:
+            logger.info(
+                f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
+            )
+            datasets = []
+            for file_path in glob.glob(f"{saved_path}_*.json"):
+                datasets.append(Dataset.load_from_json(file_path))
+        else:
+            datasets = Dataset.search_in_hdx(
+                query, configuration, page_size, **kwargs
+            )
+            if self.save:
+                for i, dataset in enumerate(datasets):
+                    file_path = f"{saved_path}_{i}.json"
+                    name = dataset["name"]
+                    logger.info(f"Saving dataset {name} in {file_path}")
+                    dataset.save_to_json(file_path, follow_urls=True)
+        return datasets
     @staticmethod
     def construct_filename(name: str, format: str):
         """Construct filename from name and format. The filename of the file
@@ -438,7 +493,10 @@ class Read(Retrieve):
         return self.hxl_info_file(name, format, url, **kwargs)
     def read_hdx_metadata(
-        self, datasetinfo: Dict, do_resource_check: bool = True
+        self,
+        datasetinfo: Dict,
+        do_resource_check: bool = True,
+        configuration: Optional[Configuration] = None,
     ) -> Optional[Resource]:
         """Read metadata from HDX dataset and add to input dictionary. If url
         is not supplied, will look through resources for one that matches
@@ -454,13 +512,14 @@ class Read(Retrieve):
         Args:
             datasetinfo (Dict): Dictionary of information about dataset
             do_resource_check (bool): Whether to check resources. Defaults to False.
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
         Returns:
             Optional[Resource]: The resource if a url was not given
         """
         dataset_nameinfo = datasetinfo["dataset"]
         if isinstance(dataset_nameinfo, str):
-            dataset = self.read_dataset(dataset_nameinfo)
+            dataset = self.read_dataset(dataset_nameinfo, configuration)
             resource = None
             url = datasetinfo.get("url")
             resource_name = datasetinfo.get("resource")
@@ -491,24 +550,24 @@ class Read(Retrieve):
                 else:
                     url = resource["url"]  # otherwise set the url key in
                     # datasetinfo to the resource url (by setting url here)
-                datasetinfo[
-                    "hapi_resource_metadata"
-                ] = self.get_hapi_resource_metadata(resource)
+                datasetinfo["hapi_resource_metadata"] = (
+                    self.get_hapi_resource_metadata(resource)
+                )
                 datasetinfo["url"] = url
             if "source_date" not in datasetinfo:
-                datasetinfo[
-                    "source_date"
-                ] = get_startend_dates_from_time_period(
-                    dataset, today=self.today
+                datasetinfo["source_date"] = (
+                    get_startend_dates_from_time_period(
+                        dataset, today=self.today
+                    )
                 )
             if "source" not in datasetinfo:
                 datasetinfo["source"] = dataset["dataset_source"]
             if "source_url" not in datasetinfo:
                 datasetinfo["source_url"] = dataset.get_hdx_url()
             Sources.standardise_datasetinfo_source_date(datasetinfo)
-            datasetinfo[
-                "hapi_dataset_metadata"
-            ] = self.get_hapi_dataset_metadata(dataset, datasetinfo)
+            datasetinfo["hapi_dataset_metadata"] = (
+                self.get_hapi_dataset_metadata(dataset, datasetinfo)
+            )
             return resource
         if "source_date" not in datasetinfo:
@@ -527,7 +586,7 @@ class Read(Retrieve):
         for hxltag, dataset_name in dataset_nameinfo.items():
             dataset = datasets.get(dataset_name)
             if not dataset:
-                dataset = self.read_dataset(dataset_name)
+                dataset = self.read_dataset(dataset_name, configuration)
                 datasets[dataset_name] = dataset
             if source_date is not None:
                 if hxltag == "default_dataset":
@@ -561,18 +620,22 @@ class Read(Retrieve):
     def read_hdx(
         self,
         datasetinfo: Dict,
+        configuration: Optional[Configuration] = None,
         **kwargs: Any,
     ) -> Tuple[List[str], Iterator[Dict]]:
         """Read data and metadata from HDX dataset
         Args:
             datasetinfo (Dict): Dictionary of information about dataset
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
             **kwargs: Parameters to pass to download_file call
         Returns:
             Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
         """
-        resource = self.read_hdx_metadata(datasetinfo)
+        resource = self.read_hdx_metadata(
+            datasetinfo, configuration=configuration
+        )
         filename = kwargs.get("filename")
         if filename:
             del kwargs["filename"]
@@ -593,12 +656,14 @@ class Read(Retrieve):
     def read(
         self,
         datasetinfo: Dict,
+        configuration: Optional[Configuration] = None,
         **kwargs: Any,
     ) -> Tuple[List[str], Iterator[Dict]]:
         """Read data and metadata from HDX dataset
         Args:
             datasetinfo (Dict): Dictionary of information about dataset
+            configuration (Optional[Configuration]): HDX configuration. Defaults to global configuration.
             **kwargs: Parameters to pass to download_file call
         Returns:
@@ -607,7 +672,9 @@ class Read(Retrieve):
         format = datasetinfo["format"]
         if format in ["json", "csv", "xls", "xlsx"]:
             if "dataset" in datasetinfo:
-                headers, iterator = self.read_hdx(datasetinfo, **kwargs)
+                headers, iterator = self.read_hdx(
+                    datasetinfo, configuration, **kwargs
+                )
             else:
                 headers, iterator = self.read_tabular(datasetinfo, **kwargs)
         else:

hdx/scraper/utilities/sources.py CHANGED Viewed

@@ -282,9 +282,9 @@ class Sources:
         if no_sources:
             source_configuration["no_sources"] = True
             return source_configuration
-        source_configuration[
-            "should_overwrite_sources"
-        ] = should_overwrite_sources
+        source_configuration["should_overwrite_sources"] = (
+            should_overwrite_sources
+        )
         if suffix_attribute:
             source_configuration["suffix_attribute"] = suffix_attribute
             return source_configuration

{hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: hdx-python-scraper
-Version: 2.3.5
+Version: 2.3.7
 Summary: HDX Python scraper utilities to assemble data from multiple sources
 Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
 Author-email: Michael Rans <rans@email.com>
@@ -26,8 +26,9 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Requires-Dist: gspread
-Requires-Dist: hdx-python-api>=6.2.1
-Requires-Dist: hdx-python-country>=3.6.4
+Requires-Dist: hdx-python-api>=6.2.8
+Requires-Dist: hdx-python-country>=3.7.0
+Requires-Dist: hdx-python-utilities>=3.6.8
 Requires-Dist: regex
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == 'dev'

{hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
-hdx/scraper/_version.py,sha256=-9aYLvgAp04zL8yFAMPjvf6kLKgqW1mLgyuk6XA3LcE,411
-hdx/scraper/base_scraper.py,sha256=oo9oMqCUpK8_hPwcTz2PAKabzoyU0BQu5dgWgsFa55Y,15431
+hdx/scraper/_version.py,sha256=SH_yCAX65tCK8PRP8gyPvUcp4HPVksM4fKEz1rXjzjM,411
+hdx/scraper/base_scraper.py,sha256=2eJifpb8G_KtEb9Z273suDCiMPteJsCBHwDEk3o0wA8,15433
 hdx/scraper/runner.py,sha256=v5ToiTBOvFbkMOcBAoWGmDyO5bhGooTL8pPIt3BIQ8Y,53550
 hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
 hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
 hdx/scraper/utilities/__init__.py,sha256=dvbp0qTV-kLvN4Xp0GQf8LjN3IqlytW1eaTmDjlyZy0,2391
 hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
-hdx/scraper/utilities/reader.py,sha256=hexLIJW3CdP4DmobqMM-Z2d6pgcCs1zWWBW-stqoeNU,22975
+hdx/scraper/utilities/reader.py,sha256=03S53U1GylPaeRoqEj3TT5UgiKTwVODUx3IETwCb9ps,26364
 hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
-hdx/scraper/utilities/sources.py,sha256=VNhFYSUM2xeDlN6y4Ya9_0BskjPtjwQZmCKnQgpOemQ,11511
+hdx/scraper/utilities/sources.py,sha256=KuhaTvvGzjuw0dbhWpmPFvSq5RWP9cY83nl687O3CSs,11513
 hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
-hdx_python_scraper-2.3.5.dist-info/METADATA,sha256=jYBTVEB111S1R3Cj8fZByzM4E3nRRKCr31bsCPstjPA,3318
-hdx_python_scraper-2.3.5.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
-hdx_python_scraper-2.3.5.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
-hdx_python_scraper-2.3.5.dist-info/RECORD,,
+hdx_python_scraper-2.3.7.dist-info/METADATA,sha256=Nw-xgPumG7UzJw3M1D5G9kZeUgZObM3m8mkkA1kutqg,3361
+hdx_python_scraper-2.3.7.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+hdx_python_scraper-2.3.7.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
+hdx_python_scraper-2.3.7.dist-info/RECORD,,

{hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.21.1
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any

{hdx_python_scraper-2.3.5.dist-info → hdx_python_scraper-2.3.7.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

hdx-python-scraper 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl

hdx-python-scraper 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl