hdx-python-scraper 2.2.3__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdx/scraper/_version.py +2 -2
- hdx/scraper/base_scraper.py +21 -9
- hdx/scraper/configurable/resource_downloader.py +1 -1
- hdx/scraper/configurable/scraper.py +1 -0
- hdx/scraper/configurable/timeseries.py +3 -1
- hdx/scraper/runner.py +5 -2
- hdx/scraper/utilities/reader.py +142 -40
- {hdx_python_scraper-2.2.3.dist-info → hdx_python_scraper-2.3.0.dist-info}/METADATA +16 -15
- {hdx_python_scraper-2.2.3.dist-info → hdx_python_scraper-2.3.0.dist-info}/RECORD +11 -11
- {hdx_python_scraper-2.2.3.dist-info → hdx_python_scraper-2.3.0.dist-info}/WHEEL +1 -1
- {hdx_python_scraper-2.2.3.dist-info → hdx_python_scraper-2.3.0.dist-info}/licenses/LICENSE +0 -0
hdx/scraper/_version.py
CHANGED
hdx/scraper/base_scraper.py
CHANGED
|
@@ -81,15 +81,12 @@ class BaseScraper(ABC):
|
|
|
81
81
|
self.sources: Dict[str, List] = {level: [] for level in self.headers}
|
|
82
82
|
self.source_configuration = deepcopy(source_configuration)
|
|
83
83
|
|
|
84
|
-
def get_reader(
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
"""Get reader given name if provided or using name member variable if not.
|
|
88
|
-
Set reader prefix to given prefix or name if not provided.
|
|
84
|
+
def get_reader(self, name: Optional[str] = None):
|
|
85
|
+
"""Get reader given name if provided or using name member variable if
|
|
86
|
+
not.
|
|
89
87
|
|
|
90
88
|
Args:
|
|
91
89
|
name (str): Name of scraper
|
|
92
|
-
prefix (Optional[str]): Prefix to use. Defaults to None (use scraper name).
|
|
93
90
|
|
|
94
91
|
Returns:
|
|
95
92
|
None
|
|
@@ -97,9 +94,6 @@ class BaseScraper(ABC):
|
|
|
97
94
|
if not name:
|
|
98
95
|
name = self.name
|
|
99
96
|
reader = Read.get_reader(name)
|
|
100
|
-
if not prefix:
|
|
101
|
-
prefix = name
|
|
102
|
-
reader.prefix = prefix
|
|
103
97
|
return reader
|
|
104
98
|
|
|
105
99
|
def get_headers(self, level: str) -> Optional[Tuple[Tuple]]:
|
|
@@ -362,6 +356,24 @@ class BaseScraper(ABC):
|
|
|
362
356
|
Returns:
|
|
363
357
|
Optional[Dict]: HAPI resource metadata
|
|
364
358
|
"""
|
|
359
|
+
hapi_resource_metadata = self.datasetinfo.get("hapi_resource_metadata")
|
|
360
|
+
if not hapi_resource_metadata:
|
|
361
|
+
return None
|
|
362
|
+
if "is_hxl" in hapi_resource_metadata:
|
|
363
|
+
return hapi_resource_metadata
|
|
364
|
+
reader = self.get_reader(self.name)
|
|
365
|
+
filename = self.datasetinfo.get("filename")
|
|
366
|
+
hxl_info = reader.hxl_info_hapi_resource_metadata(
|
|
367
|
+
hapi_resource_metadata, filename=filename, file_prefix=self.name
|
|
368
|
+
)
|
|
369
|
+
is_hxl = False
|
|
370
|
+
if hxl_info:
|
|
371
|
+
for sheet in hxl_info.get("sheets", ()):
|
|
372
|
+
if sheet["is_hxlated"]:
|
|
373
|
+
is_hxl = True
|
|
374
|
+
break
|
|
375
|
+
hapi_resource_metadata["is_hxl"] = is_hxl
|
|
376
|
+
|
|
365
377
|
return self.datasetinfo.get("hapi_resource_metadata")
|
|
366
378
|
|
|
367
379
|
def add_population(self) -> None:
|
|
@@ -33,7 +33,7 @@ class ResourceDownloader(BaseScraper):
|
|
|
33
33
|
"""
|
|
34
34
|
reader = self.get_reader("hdx")
|
|
35
35
|
resource = reader.read_hdx_metadata(self.datasetinfo)
|
|
36
|
-
url, path = reader.download_resource(self.name
|
|
36
|
+
url, path = reader.download_resource(resource, file_prefix=self.name)
|
|
37
37
|
logger.info(f"Downloading {url} to {path}")
|
|
38
38
|
copy2(path, join(self.folder, self.datasetinfo["filename"]))
|
|
39
39
|
|
|
@@ -50,7 +50,9 @@ class TimeSeries(BaseScraper):
|
|
|
50
50
|
"output_hxl"
|
|
51
51
|
]
|
|
52
52
|
rows = [headers, hxltags]
|
|
53
|
-
file_headers, iterator = self.get_reader().read(
|
|
53
|
+
file_headers, iterator = self.get_reader(self.name).read(
|
|
54
|
+
self.datasetinfo, file_prefix=self.name
|
|
55
|
+
)
|
|
54
56
|
for inrow in iterator:
|
|
55
57
|
if isinstance(datecol, list):
|
|
56
58
|
dates = [str(inrow[x]) for x in datecol]
|
hdx/scraper/runner.py
CHANGED
|
@@ -1145,7 +1145,9 @@ class Runner:
|
|
|
1145
1145
|
return sorted(source_urls)
|
|
1146
1146
|
|
|
1147
1147
|
def get_hapi_metadata(
|
|
1148
|
-
self,
|
|
1148
|
+
self,
|
|
1149
|
+
names: Optional[ListTuple[str]] = None,
|
|
1150
|
+
has_run: bool = True,
|
|
1149
1151
|
) -> Dict:
|
|
1150
1152
|
"""Get HAPI metadata for all datasets. A dictionary is returned that
|
|
1151
1153
|
maps from dataset ids to a dictionary. The dictionary has keys for
|
|
@@ -1154,6 +1156,7 @@ class Runner:
|
|
|
1154
1156
|
|
|
1155
1157
|
Args:
|
|
1156
1158
|
names (Optional[ListTuple[str]]): Names of scrapers
|
|
1159
|
+
has_run (bool): Only get results for scrapers marked as having run. Defaults to True.
|
|
1157
1160
|
|
|
1158
1161
|
Returns:
|
|
1159
1162
|
Dict: HAPI metadata for all datasets
|
|
@@ -1163,7 +1166,7 @@ class Runner:
|
|
|
1163
1166
|
results = {}
|
|
1164
1167
|
for name in names:
|
|
1165
1168
|
scraper = self.get_scraper(name)
|
|
1166
|
-
if not scraper.has_run:
|
|
1169
|
+
if has_run and not scraper.has_run:
|
|
1167
1170
|
continue
|
|
1168
1171
|
hapi_dataset_metadata = scraper.get_hapi_dataset_metadata()
|
|
1169
1172
|
hapi_resource_metadata = scraper.get_hapi_resource_metadata()
|
hdx/scraper/utilities/reader.py
CHANGED
|
@@ -194,7 +194,7 @@ class Read(Retrieve):
|
|
|
194
194
|
|
|
195
195
|
Args:
|
|
196
196
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
197
|
-
**kwargs:
|
|
197
|
+
**kwargs: Parameters to pass to download_file call
|
|
198
198
|
|
|
199
199
|
Returns:
|
|
200
200
|
Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
|
|
@@ -219,6 +219,11 @@ class Read(Retrieve):
|
|
|
219
219
|
url = datasetinfo["url"]
|
|
220
220
|
if isinstance(url, list):
|
|
221
221
|
url = [self.get_url(x, **kwargs) for x in url]
|
|
222
|
+
filename = kwargs.get("filename")
|
|
223
|
+
if not filename:
|
|
224
|
+
filename = datasetinfo.get("filename")
|
|
225
|
+
if filename:
|
|
226
|
+
kwargs["filename"] = filename
|
|
222
227
|
return self.get_tabular_rows(
|
|
223
228
|
url,
|
|
224
229
|
dict_form=True,
|
|
@@ -249,57 +254,67 @@ class Read(Retrieve):
|
|
|
249
254
|
dataset.save_to_json(saved_path, follow_urls=True)
|
|
250
255
|
return dataset
|
|
251
256
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
of the file. The identifier is information to identify what called
|
|
257
|
-
this function and is used to prefix the filename of the file.
|
|
257
|
+
@staticmethod
|
|
258
|
+
def construct_filename(name: str, format: str):
|
|
259
|
+
"""Construct filename from name and format. The filename of the file
|
|
260
|
+
comes from the name and format.
|
|
258
261
|
|
|
259
262
|
Args:
|
|
260
|
-
|
|
261
|
-
|
|
263
|
+
name (str): Name for the download
|
|
264
|
+
format (str): Format of download
|
|
262
265
|
|
|
263
266
|
Returns:
|
|
264
|
-
|
|
267
|
+
str: Filename of file
|
|
265
268
|
"""
|
|
266
|
-
filename =
|
|
267
|
-
file_type = f".{
|
|
269
|
+
filename = name.lower()
|
|
270
|
+
file_type = f".{format}"
|
|
268
271
|
if filename.endswith(file_type):
|
|
269
272
|
filename = filename[: -len(file_type)]
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
+
return f"{slugify(filename, separator='_')}{file_type}"
|
|
274
|
+
|
|
275
|
+
def construct_filename_and_download(
|
|
276
|
+
self, name: str, format: str, url: str, **kwargs: Any
|
|
277
|
+
) -> Tuple[str, str]:
|
|
278
|
+
"""Construct filename, download file and return the url downloaded and
|
|
279
|
+
the path of the file. The filename of the file comes from the name and
|
|
280
|
+
format.
|
|
281
|
+
|
|
282
|
+
Args:
|
|
283
|
+
name (str): Name for the download
|
|
284
|
+
format (str): Format of download
|
|
285
|
+
url (str): URL of download
|
|
286
|
+
**kwargs: Parameters to pass to download_file call
|
|
287
|
+
|
|
288
|
+
Returns:
|
|
289
|
+
Tuple[str, str]: (URL that was downloaded, path to downloaded file)
|
|
290
|
+
"""
|
|
291
|
+
filename = kwargs.get("filename")
|
|
292
|
+
if not filename:
|
|
293
|
+
kwargs["filename"] = self.construct_filename(name, format)
|
|
294
|
+
url = munge_url(url, InputOptions())
|
|
295
|
+
path = self.download_file(url, **kwargs)
|
|
273
296
|
return url, path
|
|
274
297
|
|
|
275
|
-
def
|
|
276
|
-
self,
|
|
277
|
-
) ->
|
|
278
|
-
"""
|
|
279
|
-
|
|
280
|
-
|
|
298
|
+
def download_resource(
|
|
299
|
+
self, resource: Resource, **kwargs: Any
|
|
300
|
+
) -> Tuple[str, str]:
|
|
301
|
+
"""Download HDX resource as a file and return the url downloaded and
|
|
302
|
+
the path of the file. The filename of the file comes from the name and
|
|
303
|
+
format.
|
|
281
304
|
|
|
282
305
|
Args:
|
|
283
|
-
identifier (str): Information to identify caller
|
|
284
306
|
resource (Resource): HDX resource
|
|
285
|
-
|
|
307
|
+
**kwargs: Parameters to pass to download_file call
|
|
286
308
|
|
|
287
309
|
Returns:
|
|
288
|
-
|
|
310
|
+
Tuple[str, str]: (URL that was downloaded, path to downloaded file)
|
|
289
311
|
"""
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
logger.warning(
|
|
297
|
-
f"Could not process {data_type} for {identifier}. Maybe there are no HXL tags?"
|
|
298
|
-
)
|
|
299
|
-
return None
|
|
300
|
-
except Exception:
|
|
301
|
-
logger.exception(f"Error reading {data_type} for {identifier}!")
|
|
302
|
-
raise
|
|
312
|
+
return self.construct_filename_and_download(
|
|
313
|
+
resource["name"],
|
|
314
|
+
resource.get_file_type(),
|
|
315
|
+
resource["url"],
|
|
316
|
+
**kwargs,
|
|
317
|
+
)
|
|
303
318
|
|
|
304
319
|
def get_hapi_dataset_metadata(self, dataset: Dataset) -> Dict:
|
|
305
320
|
"""Get HAPI dataset metadata from HDX dataset
|
|
@@ -337,6 +352,82 @@ class Read(Retrieve):
|
|
|
337
352
|
"download_url": resource["url"],
|
|
338
353
|
}
|
|
339
354
|
|
|
355
|
+
def read_hxl_resource(
|
|
356
|
+
self, resource: Resource, **kwargs: Any
|
|
357
|
+
) -> Optional[hxl.Dataset]:
|
|
358
|
+
"""Read HDX resource as an HXL dataset.
|
|
359
|
+
|
|
360
|
+
Args:
|
|
361
|
+
resource (Resource): HDX resource
|
|
362
|
+
**kwargs: Parameters to pass to download_file call
|
|
363
|
+
|
|
364
|
+
Returns:
|
|
365
|
+
Optional[hxl.Dataset]: HXL dataset or None
|
|
366
|
+
"""
|
|
367
|
+
url = resource["url"]
|
|
368
|
+
try:
|
|
369
|
+
_, path = self.download_resource(resource, **kwargs)
|
|
370
|
+
data = hxl.data(path, InputOptions(allow_local=True)).cache()
|
|
371
|
+
data.display_tags
|
|
372
|
+
return data
|
|
373
|
+
except hxl.HXLException:
|
|
374
|
+
logger.warning(
|
|
375
|
+
f"Could not process {url}. Maybe there are no HXL tags?"
|
|
376
|
+
)
|
|
377
|
+
return None
|
|
378
|
+
except Exception:
|
|
379
|
+
logger.exception(f"Error reading {url}!")
|
|
380
|
+
raise
|
|
381
|
+
|
|
382
|
+
def hxl_info_file(
|
|
383
|
+
self, name: str, format: str, url: str, **kwargs: Any
|
|
384
|
+
) -> Optional[Dict]:
|
|
385
|
+
"""Get HXL info on file. The filename comes from the name and
|
|
386
|
+
format.
|
|
387
|
+
|
|
388
|
+
Args:
|
|
389
|
+
name (str): Name for the download
|
|
390
|
+
format (str): Format of download
|
|
391
|
+
url (str): URL of download
|
|
392
|
+
**kwargs (Any): Parameters to pass to download_file call
|
|
393
|
+
|
|
394
|
+
Returns:
|
|
395
|
+
Optional[Dict]: Information about file or None
|
|
396
|
+
"""
|
|
397
|
+
try:
|
|
398
|
+
_, path = self.construct_filename_and_download(
|
|
399
|
+
name, format, url, **kwargs
|
|
400
|
+
)
|
|
401
|
+
return hxl.info(path, InputOptions(allow_local=True))
|
|
402
|
+
except hxl.HXLException:
|
|
403
|
+
logger.warning(
|
|
404
|
+
f"Could not process {url}. Maybe there are no HXL tags?"
|
|
405
|
+
)
|
|
406
|
+
return None
|
|
407
|
+
except Exception:
|
|
408
|
+
logger.exception(f"Error reading {url}!")
|
|
409
|
+
raise
|
|
410
|
+
|
|
411
|
+
def hxl_info_hapi_resource_metadata(
|
|
412
|
+
self,
|
|
413
|
+
hapi_resource_metadata: Dict,
|
|
414
|
+
**kwargs: Any,
|
|
415
|
+
) -> Optional[Dict]:
|
|
416
|
+
"""Get HXL info on HAPI resource. The filename comes from the name and
|
|
417
|
+
format.
|
|
418
|
+
|
|
419
|
+
Args:
|
|
420
|
+
hapi_resource_metadata (Dict): HAPI resource metadata
|
|
421
|
+
**kwargs (Any): Parameters to pass to download_file call
|
|
422
|
+
|
|
423
|
+
Returns:
|
|
424
|
+
Optional[Dict]: Information about file or None
|
|
425
|
+
"""
|
|
426
|
+
name = hapi_resource_metadata["name"]
|
|
427
|
+
format = hapi_resource_metadata["format"]
|
|
428
|
+
url = hapi_resource_metadata["download_url"]
|
|
429
|
+
return self.hxl_info_file(name, format, url, **kwargs)
|
|
430
|
+
|
|
340
431
|
def read_hdx_metadata(
|
|
341
432
|
self, datasetinfo: Dict, do_resource_check: bool = True
|
|
342
433
|
) -> Optional[Resource]:
|
|
@@ -452,12 +543,23 @@ class Read(Retrieve):
|
|
|
452
543
|
|
|
453
544
|
Args:
|
|
454
545
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
455
|
-
**kwargs:
|
|
546
|
+
**kwargs: Parameters to pass to download_file call
|
|
456
547
|
|
|
457
548
|
Returns:
|
|
458
549
|
Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
|
|
459
550
|
"""
|
|
460
|
-
self.read_hdx_metadata(datasetinfo)
|
|
551
|
+
resource = self.read_hdx_metadata(datasetinfo)
|
|
552
|
+
filename = kwargs.get("filename")
|
|
553
|
+
if filename:
|
|
554
|
+
del kwargs["filename"]
|
|
555
|
+
datasetinfo["filename"] = filename
|
|
556
|
+
filename = datasetinfo.get("filename")
|
|
557
|
+
if resource and not filename:
|
|
558
|
+
# prefix is added later
|
|
559
|
+
filename = self.construct_filename(
|
|
560
|
+
resource["name"], resource.get_file_type()
|
|
561
|
+
)
|
|
562
|
+
datasetinfo["filename"] = filename
|
|
461
563
|
return self.read_tabular(datasetinfo, **kwargs)
|
|
462
564
|
|
|
463
565
|
def read(
|
|
@@ -469,7 +571,7 @@ class Read(Retrieve):
|
|
|
469
571
|
|
|
470
572
|
Args:
|
|
471
573
|
datasetinfo (Dict): Dictionary of information about dataset
|
|
472
|
-
**kwargs:
|
|
574
|
+
**kwargs: Parameters to pass to download_file call
|
|
473
575
|
|
|
474
576
|
Returns:
|
|
475
577
|
Tuple[List[str],Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hdx-python-scraper
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: HDX Python scraper utilities to assemble data from multiple sources
|
|
5
5
|
Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
|
|
6
6
|
Author-email: Michael Rans <rans@email.com>
|
|
@@ -26,12 +26,13 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Requires-Dist: gspread
|
|
29
|
-
Requires-Dist: hdx-python-api>=6.1.
|
|
29
|
+
Requires-Dist: hdx-python-api>=6.1.4
|
|
30
|
+
Requires-Dist: hdx-python-country>=3.6.3
|
|
30
31
|
Requires-Dist: regex
|
|
31
32
|
Provides-Extra: dev
|
|
32
33
|
Requires-Dist: pre-commit; extra == 'dev'
|
|
33
34
|
Provides-Extra: pandas
|
|
34
|
-
Requires-Dist: pandas>=2.1.
|
|
35
|
+
Requires-Dist: pandas>=2.1.3; extra == 'pandas'
|
|
35
36
|
Provides-Extra: test
|
|
36
37
|
Requires-Dist: pytest; extra == 'test'
|
|
37
38
|
Requires-Dist: pytest-cov; extra == 'test'
|
|
@@ -43,19 +44,19 @@ Description-Content-Type: text/markdown
|
|
|
43
44
|
[](https://pycqa.github.io/isort/)
|
|
44
45
|
[](https://pypistats.org/packages/hdx-python-scraper)
|
|
45
46
|
|
|
46
|
-
The HDX Python Scraper Library is designed to enable you to easily develop code that
|
|
47
|
-
assembles data from one or more tabular sources that can be csv, xls, xlsx or JSON. It
|
|
48
|
-
uses a YAML file that specifies for each source what needs to be read and allows some
|
|
49
|
-
transformations to be performed on the data. The output is written to JSON, Google sheets
|
|
50
|
-
and/or Excel and includes the addition of
|
|
51
|
-
[Humanitarian Exchange Language (HXL)](https://hxlstandard.org/) hashtags specified in
|
|
52
|
-
the YAML file. Custom Python scrapers can also be written that conform to a defined
|
|
53
|
-
specification and the framework handles the execution of both configurable and custom
|
|
47
|
+
The HDX Python Scraper Library is designed to enable you to easily develop code that
|
|
48
|
+
assembles data from one or more tabular sources that can be csv, xls, xlsx or JSON. It
|
|
49
|
+
uses a YAML file that specifies for each source what needs to be read and allows some
|
|
50
|
+
transformations to be performed on the data. The output is written to JSON, Google sheets
|
|
51
|
+
and/or Excel and includes the addition of
|
|
52
|
+
[Humanitarian Exchange Language (HXL)](https://hxlstandard.org/) hashtags specified in
|
|
53
|
+
the YAML file. Custom Python scrapers can also be written that conform to a defined
|
|
54
|
+
specification and the framework handles the execution of both configurable and custom
|
|
54
55
|
scrapers.
|
|
55
56
|
|
|
56
|
-
For more information, please read the
|
|
57
|
-
[documentation](https://hdx-python-scraper.readthedocs.io/en/latest/).
|
|
57
|
+
For more information, please read the
|
|
58
|
+
[documentation](https://hdx-python-scraper.readthedocs.io/en/latest/).
|
|
58
59
|
|
|
59
|
-
This library is part of the
|
|
60
|
-
[Humanitarian Data Exchange](https://data.humdata.org/) (HDX) project. If you have
|
|
60
|
+
This library is part of the
|
|
61
|
+
[Humanitarian Data Exchange](https://data.humdata.org/) (HDX) project. If you have
|
|
61
62
|
humanitarian related data, please upload your datasets to HDX.
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
hdx/scraper/__init__.py,sha256=11ozJKiUsqDCZ3_mcAHhGYUyGK_Unl54djVSBBExFB4,59
|
|
2
|
-
hdx/scraper/_version.py,sha256=
|
|
3
|
-
hdx/scraper/base_scraper.py,sha256=
|
|
4
|
-
hdx/scraper/runner.py,sha256
|
|
2
|
+
hdx/scraper/_version.py,sha256=ChsIHG8bRc-eXUbXOgv4Fm4DstSKLq9FpsTAsaMeR08,411
|
|
3
|
+
hdx/scraper/base_scraper.py,sha256=OZoC8X3woecKbMxTtjx_aRr027SeJCS2gbtyB20n31o,15079
|
|
4
|
+
hdx/scraper/runner.py,sha256=fojFcfEh3mZXe1dY3Jpis22dr9Zc6VY-0XTMiabuXFE,51366
|
|
5
5
|
hdx/scraper/configurable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
hdx/scraper/configurable/aggregator.py,sha256=xC7bOF-wrQ17LlvdjSZUnUGuZHlNMH5jlmLSgyz5pe0,14976
|
|
7
|
-
hdx/scraper/configurable/resource_downloader.py,sha256=
|
|
7
|
+
hdx/scraper/configurable/resource_downloader.py,sha256=lCIQpNZtcCTRc3z0FFM2_JxRtoua9GEq2XiKRZ9fqZk,1549
|
|
8
8
|
hdx/scraper/configurable/rowparser.py,sha256=h7a0W2xvVJSAu94nS5CAXvZSZXdwZ-isFHHNaIce0gM,14635
|
|
9
|
-
hdx/scraper/configurable/scraper.py,sha256=
|
|
10
|
-
hdx/scraper/configurable/timeseries.py,sha256=
|
|
9
|
+
hdx/scraper/configurable/scraper.py,sha256=kBkS-bm4zIQ9jbzFcwVoAnyji_9PTV_KKrNJVLTuYa4,20498
|
|
10
|
+
hdx/scraper/configurable/timeseries.py,sha256=lWoQJApml-onTN4l9YnTAYnhj5uuTc-Luk05DIT7O9k,3036
|
|
11
11
|
hdx/scraper/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
hdx/scraper/outputs/base.py,sha256=UBVFPANdd7wawifbKkPQWKwVC-Tr7Jg5ax1eLTmWX3M,2566
|
|
13
13
|
hdx/scraper/outputs/excelfile.py,sha256=bKBj1aYUJCIXhvpmGXAJ0FLoKwjnj-2E0LlR64RcFdY,2197
|
|
@@ -15,11 +15,11 @@ hdx/scraper/outputs/googlesheets.py,sha256=gPjzikxP4wmMBGL5LW50MXUcDq5nwCRMW74G1
|
|
|
15
15
|
hdx/scraper/outputs/json.py,sha256=uw9_yAVpHVPWQ8LtMUZKTH88okyrHQs_SVjT6HJOxZ4,9498
|
|
16
16
|
hdx/scraper/utilities/__init__.py,sha256=iBjD7bc8wEzQhwkcx2mOZwYmu28VHjl5px66quqWJ8E,2491
|
|
17
17
|
hdx/scraper/utilities/fallbacks.py,sha256=08tvqVFuFV_gsvS7jqEiJUr7gqNILKCakDa8xMuIMpI,6186
|
|
18
|
-
hdx/scraper/utilities/reader.py,sha256=
|
|
18
|
+
hdx/scraper/utilities/reader.py,sha256=9cXrk8_NrE4kHIm3wrM3KHgKX6bho_eCyibMDBairiU,21499
|
|
19
19
|
hdx/scraper/utilities/region_lookup.py,sha256=VSfIoBGmhS0lNgwe4kKIhHqP7k0DlJYI2JDdABAAmoM,3917
|
|
20
20
|
hdx/scraper/utilities/sources.py,sha256=h27PjBADqIhqDwmhzMXt1OjwJWZc2iVnIBwJuAJKHwo,11204
|
|
21
21
|
hdx/scraper/utilities/writer.py,sha256=x-3xnOjvZEMUR2Op42eiBbaSmtNM6MY86adnL_Cob9s,16726
|
|
22
|
-
hdx_python_scraper-2.
|
|
23
|
-
hdx_python_scraper-2.
|
|
24
|
-
hdx_python_scraper-2.
|
|
25
|
-
hdx_python_scraper-2.
|
|
22
|
+
hdx_python_scraper-2.3.0.dist-info/METADATA,sha256=E5b13txhk44RjnOSKJu_SkaypNFXxe5YDLUBCWKA7Pk,3318
|
|
23
|
+
hdx_python_scraper-2.3.0.dist-info/WHEEL,sha256=0wCxn4rnLsvRWBK-NC7mK2QMIQ_aZSl7Qvk-8IWl_pY,87
|
|
24
|
+
hdx_python_scraper-2.3.0.dist-info/licenses/LICENSE,sha256=wc-4GpMn-ODs-U_bTe1YCiPVgvcjzrpYOx2wPuyAeII,1079
|
|
25
|
+
hdx_python_scraper-2.3.0.dist-info/RECORD,,
|
|
File without changes
|