PyPI - python-documentcloud - Versions diffs - 4.4.1__tar.gz → 4.6.0__tar.gz - Mend

python-documentcloud 4.4.1__tar.gz → 4.6.0__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (34)

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-documentcloud
-Version: 4.4.1
+Version: 4.6.0
 Summary: A simple Python wrapper for the DocumentCloud API
 Home-page: https://github.com/muckrock/python-documentcloud
 Author: Mitchell Kotler

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/addon.py RENAMED Viewed

@@ -182,6 +182,26 @@ class AddOn(BaseAddOn):
             f"addon_runs/{self.id}/", json={"file_name": file_name}
         )
+    def load_run_data(self):
+        "Load persistent data from this run"
+        if not self.id:
+            return {}
+        response = self.client.get(f"addon_runs/{self.id}/")
+        response.raise_for_status()
+        return response.json().get("data", {})
+    def store_run_data(self, data):
+        "Store persistent data for this run"
+        if not self.id:
+            print("Run ID not set. Try again later or check if something went wrong.")
+            return None
+        if not isinstance(data, dict):
+            raise TypeError("Invalid data")
+        return self.client.patch(f"addon_runs/{self.id}/", json={"data": data})
     def load_event_data(self):
         """Load persistent data for this event"""
         if not self.event_id:

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/client.py RENAMED Viewed

@@ -14,11 +14,12 @@ from .users import UserClient
 logger = logging.getLogger("documentcloud")
 class DocumentCloud(SquareletClient):
     """
     The public interface for the DocumentCloud API, now integrated with SquareletClient
     """
-    # pylint:disable=too-many-positional-arguments
     def __init__(
         self,
         username=None,
@@ -30,7 +31,7 @@ class DocumentCloud(SquareletClient):
         rate_limit=True,
         rate_limit_sleep=True,
     ):
-       # Initialize SquareletClient for authentication and request handling
+        # Initialize SquareletClient for authentication and request handling
         super().__init__(
             base_uri=base_uri,
             username=username,
@@ -38,7 +39,7 @@ class DocumentCloud(SquareletClient):
             auth_uri=auth_uri,
             timeout=timeout,
             rate_limit=rate_limit,
-            rate_limit_sleep=rate_limit_sleep
+            rate_limit_sleep=rate_limit_sleep,
         )
         # Set up logging
@@ -55,8 +56,3 @@ class DocumentCloud(SquareletClient):
         self.projects = ProjectClient(self)
         self.users = UserClient(self)
         self.organizations = OrganizationClient(self)
-    """def _request(self, method, url, raise_error=True, **kwargs):
-        Delegates request to the SquareletClient's _request method
-        return self.squarelet_client.request(method, url, raise_error, **kwargs)
-        """

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/documentcloud/documents.py RENAMED Viewed

@@ -9,6 +9,7 @@ import os
 import re
 import warnings
 from functools import partial
+from urllib.parse import urlparse
 # Third Party
 from requests.exceptions import RequestException
@@ -23,11 +24,6 @@ from .sections import SectionClient
 from .toolbox import grouper, is_url, merge_dicts, requests_retry_session
 from .users import User
-try:
-    from urllib.parse import urlparse
-except ImportError:
-    from urlparse import urlparse
 logger = logging.getLogger("documentcloud")
 IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]
@@ -74,8 +70,11 @@ class Document(BaseAPIObject):
     def __getattr__(self, attr):
         """Generate methods for fetching resources"""
         p_image = re.compile(
-            r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
+            r"^get_"
+            r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
+            r"(?P<list>_list)?$"
         )
         get = attr.startswith("get_")
         url = attr.endswith("_url")
         text = attr.endswith("_text")
@@ -230,9 +229,15 @@ class Document(BaseAPIObject):
         return all_results
-    def process(self):
-        """Reprocess the document"""
-        self._client.post(f"{self.api_path}/{self.id}/process/")
+    def process(self, **kwargs):
+        """Process the document, used on upload and for reprocessing"""
+        payload = {}
+        if "force_ocr" in kwargs:
+            payload["force_ocr"] = kwargs["force_ocr"]
+        if "ocr_engine" in kwargs:
+            payload["ocr_engine"] = kwargs["ocr_engine"]
+        self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
 class DocumentClient(BaseAPIClient):
@@ -310,6 +315,7 @@ class DocumentClient(BaseAPIClient):
             "title",
             "data",
             "force_ocr",
+            "ocr_engine",
             "projects",
             "delayed_index",
             "revision_control",
@@ -333,21 +339,55 @@ class DocumentClient(BaseAPIClient):
         return params
+    def _extract_ocr_options(self, kwargs):
+        """
+        Extract and validate OCR options from kwargs.
+        Returns:
+            force_ocr (bool)
+            ocr_engine (str)
+        """
+        force_ocr = kwargs.pop("force_ocr", False)
+        ocr_engine = kwargs.pop("ocr_engine", "tess4")
+        if not isinstance(force_ocr, bool):
+            raise ValueError("force_ocr must be a boolean")
+        if ocr_engine and ocr_engine not in ("tess4", "textract"):
+            raise ValueError(
+                "ocr_engine must be either 'tess4' for tesseract or 'textract'"
+            )
+        return force_ocr, ocr_engine
     def _get_title(self, name):
         """Get the default title for a document from its path"""
         return name.split(os.sep)[-1].rsplit(".", 1)[0]
     def _upload_url(self, file_url, **kwargs):
         """Upload a document from a publicly accessible URL"""
+        # extract process-related args
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
+        # create the document
         params = self._format_upload_parameters(file_url, **kwargs)
         params["file_url"] = file_url
+        if force_ocr:
+            params["force_ocr"] = force_ocr
+            params["ocr_engine"] = ocr_engine
         response = self.client.post("documents/", json=params)
-        return Document(self.client, response.json())
+        create_json = response.json()
+        # wrap in Document object
+        doc = Document(self.client, create_json)
+        return doc
     def _upload_file(self, file_, **kwargs):
         """Upload a document directly"""
         # create the document
-        force_ocr = kwargs.pop("force_ocr", False)
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         params = self._format_upload_parameters(file_.name, **kwargs)
         response = self.client.post("documents/", json=params)
@@ -357,12 +397,12 @@ class DocumentClient(BaseAPIClient):
         response = requests_retry_session().put(presigned_url, data=file_.read())
         # begin processing the document
-        doc_id = create_json["id"]
-        response = self.client.post(
-            f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
-        )
+        doc = Document(self.client, create_json)
+        # begin processing
+        doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
-        return Document(self.client, create_json)
+        return doc
     def _collect_files(self, path, extensions):
         """Find the paths to files with specified extensions under a directory"""
@@ -379,171 +419,98 @@ class DocumentClient(BaseAPIClient):
     def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
         """Upload files with specified extensions in a directory"""
-        # pylint: disable=too-many-locals, too-many-branches
-        # Do not set the same title for all documents
+        # pylint:disable=too-many-locals
         kwargs.pop("title", None)
-        # If extensions are specified as None, it will check for all supported
-        # filetypes.
         if extensions is None:
             extensions = SUPPORTED_EXTENSIONS
-        # Convert single extension to a list if provided
         if extensions and not isinstance(extensions, list):
             extensions = [extensions]
-        # Checks to see if the extensions are supported, raises an error if not.
         invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
         if invalid_extensions:
             raise ValueError(
                 f"Invalid extensions provided: {', '.join(invalid_extensions)}"
             )
-        # Loop through the path and get all the files with matching extensions
         path_list = self._collect_files(path, extensions)
         logger.info(
-            "Upload directory on %s: Found %d files to upload",
-            path,
-            len(path_list)
+            "Upload directory on %s: Found %d files to upload", path, len(path_list)
         )
-        # Upload all the files using the bulk API to reduce the number
-        # of API calls and improve performance
         obj_list = []
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         params = self._format_upload_parameters("", **kwargs)
         for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
-            # Grouper will put None's on the end of the last group
             file_paths = [p for p in file_paths if p is not None]
             logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
-            # Create the documents
-            logger.info("Creating the documents...")
-            try:
-                response = self.client.post(
-                    "documents/",
-                    json=[
-                        merge_dicts(
-                            params,
-                            {
-                                "title": self._get_title(p),
-                                "original_extension": os.path.splitext(
-                                    os.path.basename(p)
-                                )[1]
-                                .lower()
-                                .lstrip("."),
-                            },
-                        )
-                        for p in file_paths
-                    ],
-                )
-            except (APIError, RequestException) as exc:
-                if handle_errors:
-                    logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        exc,
-                        "\n".join(file_paths)
-                    )
-                    continue
-                else:
-                    raise
+            create_json = self._create_documents(file_paths, params, handle_errors)
+            sorted_create_json = sorted(create_json, key=lambda j: j["title"])
+            sorted_file_paths = sorted(file_paths, key=self._get_title)
+            obj_list.extend(sorted_create_json)
+            presigned_urls = [j["presigned_url"] for j in sorted_create_json]
-            # Upload the files directly to storage
-            create_json = response.json()
-            obj_list.extend(create_json)
-            presigned_urls = [j["presigned_url"] for j in create_json]
-            for url, file_path in zip(presigned_urls, file_paths):
-                logger.info("Uploading %s to S3...", file_path)
-                try:
-                    with open(file_path, "rb") as file:
-                        response = requests_retry_session().put(url, data=file.read())
-                    self.client.raise_for_status(response)
-                except (APIError, RequestException) as exc:
-                    if handle_errors:
-                        logger.info(
-                            "Error uploading the following document: %s %s",
-                            exc,
-                            file_path
-                        )
-                        continue
-                    else:
-                        raise
-            # Begin processing the documents
-            logger.info("Processing the documents...")
-            doc_ids = [j["id"] for j in create_json]
-            try:
-                response = self.client.post("documents/process/", json={"ids": doc_ids})
-            except (APIError, RequestException) as exc:
-                if handle_errors:
-                    logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        exc,
-                        "\n".join(file_paths)
-                    )
-                    continue
-                else:
-                    raise
+            self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
+            self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
         logger.info("Upload directory complete")
-        # Pass back the list of documents
         return [Document(self.client, d) for d in obj_list]
-    def upload_urls(self, url_list, handle_errors=False, **kwargs):
-        """Upload documents from a list of URLs"""
-        # Do not set the same title for all documents
-        kwargs.pop("title", None)
-        obj_list = []
-        params = self._format_upload_parameters("", **kwargs)
-        for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
-            # Grouper will put None's on the end of the last group
-            url_group = [url for url in url_group if url is not None]
-            logger.info(
-                "Uploading group %d: %s",
-                i + 1,
-                "\n".join(url_group)
+    def _create_documents(self, file_paths, params, handle_errors):
+        body = [
+            merge_dicts(
+                params,
+                {
+                    "title": self._get_title(p),
+                    "original_extension": os.path.splitext(os.path.basename(p))[1]
+                    .lower()
+                    .lstrip("."),
+                },
             )
-            # Create the documents
-            logger.info("Creating the documents...")
-            try:
-                response = self.client.post(
-                    "documents/",
-                    json=[
-                        merge_dicts(
-                            params,
-                            {
-                                "title": self._get_title(url),
-                                "file_url": url,
-                            },
-                        )
-                        for url in url_group
-                    ],
+            for p in sorted(file_paths)
+        ]
+        try:
+            response = self.client.post("documents/", json=body)
+        except (APIError, RequestException) as exc:
+            if handle_errors:
+                logger.info(
+                    "Error creating the following documents: %s\n%s",
+                    exc,
+                    "\n".join(file_paths),
                 )
+                return []
+            else:
+                raise
+        return response.json()
+    def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
+        for url, file_path in zip(presigned_urls, file_paths):
+            logger.info("Uploading %s to S3...", file_path)
+            try:
+                with open(file_path, "rb") as f:
+                    response = requests_retry_session().put(url, data=f.read())
+                self.client.raise_for_status(response)
             except (APIError, RequestException) as exc:
                 if handle_errors:
                     logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        str(exc),
-                        "\n".join(url_group)
+                        "Error uploading the following document: %s %s", exc, file_path
                     )
-                    continue
                 else:
                     raise
-            create_json = response.json()
-            obj_list.extend(create_json)
-        logger.info("Upload URLs complete")
-        # Pass back the list of documents
-        return [Document(self.client, d) for d in obj_list]
+    def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
+        payload = [
+            {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
+            for j in create_json
+        ]
+        try:
+            self.client.post("documents/process/", json=payload)
+        except (APIError, RequestException) as exc:
+            if handle_errors:
+                logger.info("Error processing documents: %s", exc)
+            else:
+                raise
 class Mention:

python_documentcloud-4.6.0/documentcloud/exceptions.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""
+Custom exceptions for python-documentcloud
+"""
+# Third Party
+# pylint: disable=unused-import
+# Import exceptions from python-squarelet
+from squarelet.exceptions import (
+    APIError,
+    CredentialsFailedError,
+    DoesNotExistError,
+    DuplicateObjectError,
+    MultipleObjectsReturnedError,
+    SquareletError as DocumentCloudError,
+)

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-documentcloud
-Version: 4.4.1
+Version: 4.6.0
 Summary: A simple Python wrapper for the DocumentCloud API
 Home-page: https://github.com/muckrock/python-documentcloud
 Author: Mitchell Kotler

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/python_documentcloud.egg-info/SOURCES.txt RENAMED Viewed

@@ -20,6 +20,7 @@ python_documentcloud.egg-info/SOURCES.txt
 python_documentcloud.egg-info/dependency_links.txt
 python_documentcloud.egg-info/requires.txt
 python_documentcloud.egg-info/top_level.txt
+tests/test_addon.py
 tests/test_annotations.py
 tests/test_base.py
 tests/test_client.py

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/setup.py RENAMED Viewed

@@ -7,7 +7,7 @@ with open("README.md", "r") as fh:
 setup(
     name="python-documentcloud",
-    version="4.4.1",
+    version="4.6.0",
     description="A simple Python wrapper for the DocumentCloud API",
     author="Mitchell Kotler",
     author_email="mitch@muckrock.com",

python_documentcloud-4.6.0/tests/test_addon.py ADDED Viewed

@@ -0,0 +1,141 @@
+# Standard Library
+from unittest.mock import MagicMock
+# Third Party
+import pytest
+# DocumentCloud
+from documentcloud.addon import AddOn
+# pylint: disable=redefined-outer-name
+@pytest.fixture
+def addon():
+    """An AddOn instance built without invoking argparse or constructing a real client.
+    Tests can override `.id`, `.event_id`, `.client`, etc. as needed.
+    """
+    instance = AddOn.__new__(AddOn)
+    instance.id = "run-123"
+    instance.addon_id = "addon-1"
+    instance.event_id = None
+    instance.documents = None
+    instance.query = None
+    instance.user_id = None
+    instance.org_id = None
+    instance.data = {}
+    instance.title = "Test AddOn"
+    instance.client = MagicMock()
+    return instance
+class TestLoadRunData:
+    def test_returns_data_when_run_id_set(self, addon):
+        addon.client.get.return_value.json.return_value = {"data": {"foo": "bar"}}
+        result = addon.load_run_data()
+        addon.client.get.assert_called_once_with("addon_runs/run-123/")
+        assert result == {"foo": "bar"}
+    def test_returns_empty_dict_when_no_run_id(self, addon):
+        addon.id = None
+        assert addon.load_run_data() == {}
+        addon.client.get.assert_not_called()
+    def test_returns_empty_dict_when_data_missing_from_response(self, addon):
+        addon.client.get.return_value.json.return_value = {}
+        assert addon.load_run_data() == {}
+class TestStoreRunData:
+    def test_patches_run_with_data(self, addon):
+        addon.store_run_data({"foo": "bar"})
+        addon.client.patch.assert_called_once_with(
+            "addon_runs/run-123/", json={"data": {"foo": "bar"}}
+        )
+    def test_no_op_when_no_run_id(self, addon, capsys):
+        addon.id = None
+        result = addon.store_run_data({"foo": "bar"})
+        assert result is None
+        addon.client.patch.assert_not_called()
+        assert "Run ID not set" in capsys.readouterr().out
+    def test_rejects_non_dict_data(self, addon):
+        with pytest.raises(TypeError):
+            addon.store_run_data("not a dict")
+        addon.client.patch.assert_not_called()
+class TestLoadEventData:
+    def test_returns_scratch_when_event_id_set(self, addon):
+        addon.event_id = "evt-9"
+        addon.client.get.return_value.json.return_value = {"scratch": {"x": 1}}
+        result = addon.load_event_data()
+        addon.client.get.assert_called_once_with("addon_events/evt-9/")
+        assert result == {"x": 1}
+    def test_returns_none_when_no_event_id(self, addon):
+        assert addon.load_event_data() is None
+        addon.client.get.assert_not_called()
+class TestStoreEventData:
+    def test_patches_event_with_scratch(self, addon):
+        addon.event_id = "evt-9"
+        addon.store_event_data({"x": 1})
+        addon.client.patch.assert_called_once_with(
+            "addon_events/evt-9/", json={"scratch": {"x": 1}}
+        )
+    def test_no_op_when_no_event_id(self, addon):
+        assert addon.store_event_data({"x": 1}) is None
+        addon.client.patch.assert_not_called()
+@pytest.fixture
+def real_addon(client, addon_run):
+    """An AddOn wired to the real `client` fixture and a freshly created run."""
+    instance = AddOn.__new__(AddOn)
+    instance.id = addon_run
+    instance.addon_id = None
+    instance.event_id = None
+    instance.documents = None
+    instance.query = None
+    instance.user_id = None
+    instance.org_id = None
+    instance.data = {}
+    instance.title = "Test AddOn"
+    instance.client = client
+    return instance
+class TestRunDataVCR:
+    """VCR-recorded round-trip tests against the dev DC.
+    Recording: set DC_TEST_ADDON_RUN_ID to an existing AddOnRun UUID on your
+    local dev DC, then run `make test-dev` (or `pytest --record-mode=new_episodes`).
+    """
+    def test_load_run_data_returns_dict(self, real_addon):
+        result = real_addon.load_run_data()
+        assert isinstance(result, dict)
+    def test_store_then_load_run_data_round_trip(self, real_addon):
+        payload = {"foo": "bar", "n": 42}
+        real_addon.store_run_data(payload)
+        loaded = real_addon.load_run_data()
+        assert loaded.get("foo") == "bar"
+        assert loaded.get("n") == 42

{python_documentcloud-4.4.1 → python_documentcloud-4.6.0}/tests/test_documents.py RENAMED Viewed

@@ -158,9 +158,7 @@ class TestDocument:
 class TestDocumentClient:
     def test_search(self, client, document):
-        documents = client.documents.search(
-            f"document:{document.id} simple"
-        )
+        documents = client.documents.search(f"document:{document.id} simple")
         assert documents
     def test_list(self, client):
@@ -182,7 +180,6 @@ class TestDocumentClient:
             document = document_factory(pdf)
         assert document.status == "success"
     def test_upload_file_path(self, document_factory):
         document = document_factory("tests/test.pdf")
         assert document.status == "success"

python_documentcloud-4.4.1/documentcloud/exceptions.py DELETED Viewed

@@ -1,12 +0,0 @@
-"""
-Custom exceptions for python-documentcloud
-"""
-# pylint: disable=unused-import
-# Import exceptions from python-squarelet
-from squarelet.exceptions import SquareletError as DocumentCloudError
-from squarelet.exceptions import DuplicateObjectError
-from squarelet.exceptions import CredentialsFailedError
-from squarelet.exceptions import APIError
-from squarelet.exceptions import DoesNotExistError
-from squarelet.exceptions import MultipleObjectsReturnedError