PyPI - hdx-python-scraper - Versions diffs - 2.7.0__tar.gz → 2.7.2__tar.gz - Mend

hdx-python-scraper 2.7.0__tar.gz → 2.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153)

hdx_python_scraper-2.7.2/.github/workflows/publish.yaml ADDED Viewed

@@ -0,0 +1,34 @@
+name: Publish to PyPI
+on:
+  release:
+    types: [published]
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/hdx-python-scraper
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - uses: actions/checkout@v6
+    - name: Get history and tags for versioning to work
+      run: |
+        git fetch --prune --unshallow
+        git fetch --depth=1 origin +refs/tags/*:refs/tags/*
+    - name: Install uv
+      uses: astral-sh/setup-uv@v7
+    - name: Build with uv
+      run: uv build
+    - name: Publish distribution 📦 to PyPI
+      run: uv publish

hdx_python_scraper-2.7.2/.github/workflows/run-python-tests.yaml ADDED Viewed

@@ -0,0 +1,52 @@
+name: Run tests
+on:
+  workflow_dispatch:
+  push:
+    branches-ignore: [gh-pages, "dependabot/**"]
+  pull_request:
+    branches-ignore: [gh-pages]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      checks: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v6
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          python-version: "3.13"
+      - name: Install dependencies
+        run: uv sync --frozen
+      - name: Check styling
+        run: |
+          uv run ruff format --check
+          uv run ruff check
+      - name: Test with pytest
+        env:
+          HDX_KEY_TEST: ${{ secrets.HDX_BOT_SCRAPERS_API_TOKEN }}
+          GSHEET_AUTH: ${{ secrets.HDX_PIPELINE_GSHEET_AUTH }}
+        run: uv run pytest
+      - name: Publish Unit Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          files: test-results.xml
+      - name: Publish in Coveralls
+        uses: coverallsapp/github-action@v2
+        if: always()
+        with:
+          flag-name: tests
+          format: lcov

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/.pre-commit-config.yaml RENAMED Viewed

@@ -1,5 +1,6 @@
 default_language_version:
-    python: python3.13
+  python: python3.13
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v6.0.0
@@ -8,20 +9,18 @@ repos:
       - id: end-of-file-fixer
         exclude: test_scraper_.*\.json
       - id: check-ast
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.10
+    rev: v0.14.14
     hooks:
       # Run the linter.
       - id: ruff-check
         args: [ --fix ]
       # Run the formatter.
       - id: ruff-format
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.9.22
+    rev: 0.9.25
     hooks:
-      # Run the pip compile
-      - id: pip-compile
-        name: pip-compile requirements.txt
-        files: pyproject.toml
-        args: [ pyproject.toml, --resolver=backtracking, --upgrade, -q,
-                -o, requirements.txt ]
+      # Ensure the lockfile is up-to-date with pyproject.toml
+      - id: uv-lock

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hdx-python-scraper
-Version: 2.7.0
+Version: 2.7.2
 Summary: HDX Python scraper utilities to assemble data from multiple sources
 Project-URL: Homepage, https://github.com/OCHA-DAP/hdx-python-scraper
 Author-email: Michael Rans <rans@email.com>
@@ -26,20 +26,15 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: gspread
-Requires-Dist: hdx-python-api>=6.6.0
-Requires-Dist: hdx-python-country>=4.0.0
-Requires-Dist: hdx-python-utilities>=4.0.0
+Requires-Dist: hdx-python-api>=6.6.4
+Requires-Dist: hdx-python-country>=4.1.1
+Requires-Dist: hdx-python-utilities>=4.0.4
+Requires-Dist: libhxl
 Requires-Dist: regex
-Provides-Extra: dev
-Requires-Dist: pre-commit; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkapi; extra == 'docs'
 Provides-Extra: pandas
 Requires-Dist: pandas>=2.2.3; extra == 'pandas'
-Provides-Extra: test
-Requires-Dist: pandas>=2.2.3; extra == 'test'
-Requires-Dist: pytest; extra == 'test'
-Requires-Dist: pytest-cov; extra == 'test'
 Description-Content-Type: text/markdown
 [![Build Status](https://github.com/OCHA-DAP/hdx-python-scraper/actions/workflows/run-python-tests.yaml/badge.svg)](https://github.com/OCHA-DAP/hdx-python-scraper/actions/workflows/run-python-tests.yaml)

hdx_python_scraper-2.7.2/pyproject.toml ADDED Viewed

@@ -0,0 +1,150 @@
+#########################
+# Project Configuration #
+#########################
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+[project]
+name = "hdx-python-scraper"
+description = "HDX Python scraper utilities to assemble data from multiple sources"
+authors = [{name = "Michael Rans", email = "rans@email.com"}]
+license = {text = "MIT"}
+keywords = ["HDX", "scrapers", "data assembly", "data transformation", "tabular data"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: English",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: Unix",
+    "Operating System :: MacOS",
+    "Operating System :: Microsoft :: Windows",
+]
+readme = "README.md"
+dynamic = ["version"]
+requires-python = ">=3.10"
+dependencies = [
+    "hdx-python-api>=6.6.4",
+    "hdx-python-country>=4.1.1",
+    "hdx-python-utilities>=4.0.4",
+    "libhxl",
+    "gspread",
+    "regex",
+]
+[project.optional-dependencies]
+pandas = ["pandas>=2.2.3"]
+docs = ["mkapi"]
+[dependency-groups]
+dev = [
+    "pandas>=2.2.3",
+    "pytest",
+    "pytest-cov",
+    "pre-commit",
+    "ruff==0.14.14",
+]
+[project.urls]
+Homepage = "https://github.com/OCHA-DAP/hdx-python-scraper"
+# ----------------------------------------------------------------------------
+# Hatchling (Build & Versioning)
+# ----------------------------------------------------------------------------
+[tool.hatch.version]
+source = "vcs"
+[tool.hatch.version.raw-options]
+local_scheme = "no-local-version"
+version_scheme = "python-simplified-semver"
+[tool.hatch.build.hooks.vcs]
+version-file = "src/hdx/scraper/framework/_version.py"
+[tool.hatch.build.targets.wheel]
+packages = ["src/hdx"]
+[tool.hatch.metadata]
+allow-direct-references = true
+# ----------------------------------------------------------------------------
+# Ruff (Linting & Formatting)
+# ----------------------------------------------------------------------------
+[tool.ruff]
+target-version = "py310"
+src = ["src"]
+exclude = ["_version.py"]
+[tool.ruff.lint]
+# Defaults are E (pycodestyle) and F (pyflakes). We extend them:
+extend-select = [
+  "I",   # isort
+  "UP",  # pyupgrade
+]
+ignore = [
+    "E501", # Line too long
+]
+[tool.ruff.lint.isort]
+known-local-folder = ["hdx.scraper.framework"]
+known-third-party = [
+    "hdx.api",
+    "hdx.data",
+    "hdx.facades",
+    "hdx.location",
+    "hdx.utilities",
+]
+# ----------------------------------------------------------------------------
+# Pytest (Testing)
+# ----------------------------------------------------------------------------
+[tool.pytest.ini_options]
+pythonpath = "src"
+log_cli = true
+addopts = """
+    --color=yes
+    --rootdir=.
+    --junitxml=test-results.xml
+    --cov
+    --no-cov-on-fail
+    --cov-report=lcov
+    --cov-report=term-missing
+"""
+# ----------------------------------------------------------------------------
+# Coverage (Reporting)
+# ----------------------------------------------------------------------------
+[tool.coverage.run]
+source = ["src"]
+omit = ["*/_version.py"]
+[tool.coverage.report]
+exclude_also = [
+    "from ._version",
+    "def __repr__",
+    "if self.debug:",
+    "if settings.DEBUG",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if 0:",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+    "@(abc\\.)?abstractmethod",
+]

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '2.7.0'
-__version_tuple__ = version_tuple = (2, 7, 0)
+__version__ = version = '2.7.2'
+__version_tuple__ = version_tuple = (2, 7, 2)
 __commit_id__ = commit_id = None

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/outputs/excelfile.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from openpyxl import Workbook
@@ -24,7 +25,7 @@ class ExcelFile(BaseOutput):
     """
     def __init__(
-        self, excel_path: str, tabs: dict[str, str], updatetabs: list[str]
+        self, excel_path: Path | str, tabs: dict[str, str], updatetabs: list[str]
     ) -> None:
         super().__init__(updatetabs)
         self.workbook = Workbook()

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/outputs/json.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import logging
-from os.path import join
+from pathlib import Path
 from typing import Any
 from hdx.utilities.dictandlist import dict_of_lists_add
@@ -184,7 +184,7 @@ class JsonFile(BaseOutput):
                         newrow[hxl_row[key]] = row[key]
                 self.add_data_row(name, newrow)
-    def save(self, folder: str | None = None, **kwargs: Any) -> list[str]:
+    def save(self, folder: Path | str | None = None, **kwargs: Any) -> list[Path]:
         """Save JSON file and any addition subsets of that JSON defined in the additional configuration
         Args:
@@ -197,7 +197,8 @@ class JsonFile(BaseOutput):
         filepaths = []
         filepath = self.configuration["output"]
         if folder:
-            filepath = join(folder, filepath)
+            folder = Path(folder)
+            filepath = folder / filepath
         logger.info(f"Writing JSON to {filepath}")
         save_json(self.json, filepath)
         filepaths.append(filepath)
@@ -262,7 +263,7 @@ class JsonFile(BaseOutput):
                 continue
             filedetailspath = filedetails["filepath"]
             if folder:
-                filedetailspath = join(folder, filedetailspath)
+                filedetailspath = folder / filedetailspath
             logger.info(f"Writing JSON to {filedetailspath}")
             save_json(json, filedetailspath)
             filepaths.append(filedetailspath)

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/runner.py RENAMED Viewed

@@ -2,6 +2,7 @@ import logging
 from collections.abc import Callable, Sequence
 from copy import copy
 from datetime import datetime
+from pathlib import Path
 from traceback import format_exc
 from typing import Any, Optional
@@ -422,7 +423,7 @@ class Runner:
     def add_resource_downloader(
         self,
         datasetinfo: dict,
-        folder: str = "",
+        folder: Path | str = "",
         force_add_to_run: bool = False,
     ) -> str:
         """Add resource downloader to the run. If running specific scrapers rather than
@@ -445,7 +446,7 @@ class Runner:
     def add_resource_downloaders(
         self,
         configuration: dict,
-        folder: str = "",
+        folder: Path | str = "",
         force_add_to_run: bool = False,
     ) -> list[str]:
         """Add multiple resource downloaders to the run. If running specific scrapers

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/scrapers/resource_downloader.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import logging
-from os.path import join
+from pathlib import Path
 from shutil import copy2
 from slugify import slugify
@@ -19,11 +19,11 @@ class ResourceDownloader(BaseScraper):
         folder: Folder to which to download. Default is "".
     """
-    def __init__(self, datasetinfo, folder):
+    def __init__(self, datasetinfo: dict, folder: Path | str):
         # ResourceDownloader only outputs to sources
         name = f"resource_downloader_{slugify(datasetinfo['hxltag'].lower(), separator='_')}"
         super().__init__(name, datasetinfo, {})
-        self.folder = folder
+        self.folder = Path(folder)
     def run(self) -> None:
         """Runs one resource downloader given dataset information
@@ -35,7 +35,7 @@ class ResourceDownloader(BaseScraper):
         resource = reader.read_hdx_metadata(self.datasetinfo)
         url, path = reader.download_resource(resource, file_prefix=self.name)
         logger.info(f"Downloading {url} to {path}")
-        copy2(path, join(self.folder, self.datasetinfo["filename"]))
+        copy2(path, self.folder / self.datasetinfo["filename"])
     def add_sources(self) -> None:
         """Add source for resource download

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/utilities/fallbacks.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from hdx.utilities.loader import LoadError, load_json
@@ -25,7 +26,7 @@ class Fallbacks:
     @classmethod
     def add(
         cls,
-        fallbacks_path: str,
+        fallbacks_path: Path | str,
         levels_mapping: dict[str, str] = default_levels_mapping,
         sources_key: str = "sources",
         admin_name_mapping: dict[str, str] = default_admin_name_mapping,

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/utilities/lookup.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import logging
 from copy import copy
+from pathlib import Path
 from hdx.utilities.loader import load_yaml
 from hdx.utilities.matching import get_code_from_name
@@ -20,7 +21,7 @@ class Lookup:
         classobject: Child class
     """
-    def __init__(self, yaml_config_path: str, classobject: type):
+    def __init__(self, yaml_config_path: Path | str, classobject: type):
         configuration = load_yaml(script_dir_plus_file(yaml_config_path, classobject))
         self._configuration = configuration
         initial_lookup = configuration.get("initial_lookup", {})

{hdx_python_scraper-2.7.0 → hdx_python_scraper-2.7.2}/src/hdx/scraper/framework/utilities/reader.py RENAMED Viewed

@@ -2,7 +2,7 @@ import glob
 import logging
 from collections.abc import Iterator, Sequence
 from datetime import datetime
-from os.path import join
+from pathlib import Path
 from typing import Any
 from urllib.parse import parse_qsl
@@ -41,9 +41,9 @@ class Read(Retrieve):
     def __init__(
         self,
         downloader: Download,
-        fallback_dir: str,
-        saved_dir: str,
-        temp_dir: str,
+        fallback_dir: Path | str,
+        saved_dir: Path | str,
+        temp_dir: Path | str,
         save: bool = False,
         use_saved: bool = False,
         prefix: str = "",
@@ -65,9 +65,9 @@ class Read(Retrieve):
     @classmethod
     def create_readers(
         cls,
-        fallback_dir: str,
-        saved_dir: str,
-        temp_dir: str,
+        fallback_dir: Path | str,
+        saved_dir: Path | str,
+        temp_dir: Path | str,
         save: bool = False,
         use_saved: bool = False,
         ignore: Sequence[str] = tuple(),
@@ -275,7 +275,7 @@ class Read(Retrieve):
         Returns:
             The dataset that was read or None
         """
-        saved_path = join(self.saved_dir, f"{dataset_name}.json")
+        saved_path = self.saved_dir / f"{dataset_name}.json"
         if self.use_saved:
             logger.info(f"Using saved dataset {dataset_name} in {saved_path}")
             dataset = Dataset.load_from_json(saved_path)
@@ -319,7 +319,7 @@ class Read(Retrieve):
             list of datasets resulting from query
         """
-        saved_path = join(self.saved_dir, filename)
+        saved_path = self.saved_dir / filename
         if self.use_saved:
             logger.info(
                 f"Using saved datasets in {filename}_n.json in {self.saved_dir}"
@@ -461,7 +461,7 @@ class Read(Retrieve):
         url = resource["url"]
         try:
             _, path = self.download_resource(resource, **kwargs)
-            data = hxl.data(path, InputOptions(allow_local=True)).cache()
+            data = hxl.data(str(path), InputOptions(allow_local=True)).cache()
             data.display_tags
             return data
         except hxl.HXLException:
@@ -488,7 +488,7 @@ class Read(Retrieve):
         """
         try:
             _, path = self.construct_filename_and_download(name, format, url, **kwargs)
-            return hxl.info(path, InputOptions(allow_local=True))
+            return hxl.info(str(path), InputOptions(allow_local=True))
         except hxl.HXLException:
             logger.warning(f"Could not process {url}. Maybe there are no HXL tags?")
             return None

hdx-python-scraper 2.7.0__tar.gz → 2.7.2__tar.gz

hdx-python-scraper 2.7.0__tar.gz → 2.7.2__tar.gz