PyPI - rara-tools - Versions diffs - 0.0.4__tar.gz → 0.0.8__tar.gz - Mend

rara-tools 0.0.4__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (27) hide show

{rara_tools-0.0.4/rara_tools.egg-info → rara_tools-0.0.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.4
+Version: 0.0.8
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.0.8/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.0.8

{rara_tools-0.0.4 → rara_tools-0.0.8}/pyproject.toml RENAMED Viewed

@@ -9,6 +9,7 @@ dependencies = { file = ["requirements.txt"] }
 [tool.setuptools.packages.find]
 include = [
     "rara_tools",
+    "rara_tools.constants"
 ]
 [project]

rara_tools-0.0.8/rara_tools/constants/__init__.py ADDED Viewed

File without changes

rara_tools-0.0.8/rara_tools/constants/digitizer.py ADDED Viewed

@@ -0,0 +1,13 @@
+class StatusKeys:
+    # String constants used as status keys. Judging by the module name and the
+    # values they identify digitizer steps - TODO confirm against callers.
+    CLEAN_UP = "digitizer_clean_up"
+    ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
+    # NOTE(review): the only value without the "digitizer_" prefix - confirm this is intentional.
+    UPLOAD = "s3_upload"
+    DOWNLOAD = "digitizer_s3_download"
+    OCR = "digitizer_ocr"
+class Queue:
+    # Queue name constants of the digitizer module (presumably task-queue names - confirm against callers).
+    IO = "io"
+    DOWNLOAD = "download"
+    FINISH = "finish"
+    OCR = "ocr"

rara_tools-0.0.8/rara_tools/constants/general.py ADDED Viewed

@@ -0,0 +1,10 @@
+class Status:
+    # Status value constants; each value is the upper-case string of its own name.
+    FAILED = "FAILED"
+    PENDING = "PENDING"
+    RUNNING = "RUNNING"
+    COMPLETED = "COMPLETED"
+    RETRYING = "RETRYING"
+class Queue:
+    # Queue name constant of the general module (presumably a task-queue name - confirm against callers).
+    CORE = "core"

rara_tools-0.0.8/rara_tools/elastic.py ADDED Viewed

@@ -0,0 +1,175 @@
+from typing import Dict, Optional, Any, Iterator
+import elasticsearch_dsl
+from elastic_transport import ObjectApiResponse
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+from elasticsearch_dsl import Index
+from .decorators import _elastic_connection
+class KataElastic:
+    """A class to manage all required Elasticsearch operations for Kata.
+    """
+    # Maps the short type names accepted in a schema to elasticsearch_dsl field classes.
+    TYPE_MAPPING = {
+        "keyword": elasticsearch_dsl.Keyword,
+        "text": elasticsearch_dsl.Text,
+        "float": elasticsearch_dsl.Float,
+        "integer": elasticsearch_dsl.Integer,
+        "date": elasticsearch_dsl.Date,
+    }
+    # Schema used by generate_mapping() when no schema is passed in.
+    # NOTE(review): "text" is mapped as "keyword", not "text" - confirm this is intended.
+    DEFAULT_MAPPING = {
+        "text": "keyword",
+        "parent_id": "keyword",
+        "text_quality": "float",
+        "n_chars": "integer",
+        "n_words": "integer",
+        "language": "keyword",
+        "end_page": "integer",
+        "start_page": "integer",
+        "sequence_nr": "integer",
+        "section_title": "keyword",
+        "section_type": "keyword",
+        "section_meta": "keyword",
+    }
+    def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
+        """Creates the Elasticsearch client.
+        :param: elasticsearch_url str: URL of the Elasticsearch instance.
+        :param: timeout int: Passed to the client as request_timeout.
+        """
+        self.timeout = timeout
+        self.elasticsearch_url = elasticsearch_url
+        self.elasticsearch = Elasticsearch(self.elasticsearch_url, request_timeout=self.timeout)
+    def _produce_rollover_index(self, index_prefix: str, rollover_limit: int) -> str:
+        """Picks the index that the next bulk request should write to.
+        :param: index_prefix str: Prefix shared by the rollover indices, which are named "<prefix>-<number>".
+        :param: rollover_limit int: Document count at which a new index is started.
+        :return: str: Name of the newest index, or of the next one if the newest has reached the limit.
+        """
+        indices = self.elasticsearch.indices.get(index=f"{index_prefix}-*", expand_wildcards="open")
+        # Creation dates are compared as numbers, not as strings.
+        creation_dates = {
+            name: int(info["settings"]["index"]["creation_date"]) for name, info in indices.items()
+        }
+        # new index name if none exist
+        if not creation_dates:
+            last_index_name = f"{index_prefix}-0"
+            last_index_count = 0
+        else:
+            last_index_name = max(creation_dates, key=creation_dates.get)
+            last_index_count = self.elasticsearch.count(index=last_index_name)["count"]
+        # check the size of the last index of the pipeline
+        if last_index_count >= rollover_limit:
+            # Parse the whole numeric suffix: reading only the last character
+            # would turn "<prefix>-10" back into "<prefix>-1".
+            new_index_number = int(last_index_name.rsplit("-", 1)[-1]) + 1
+            last_index_name = f"{index_prefix}-{new_index_number}"
+        return last_index_name
+    @_elastic_connection
+    def check(self) -> bool:
+        """Checks Elasticsearch connection.
+        :return: bool: Elasticsearch alive or dead.
+        """
+        return bool(self.elasticsearch.ping())
+    def generate_mapping(self, schema: dict | None = None) -> dict:
+        """Builds a mapping body from a schema of field names and type names.
+        :param: schema dict: Field names mapped to keys of TYPE_MAPPING. DEFAULT_MAPPING is used if empty or None.
+        :return: dict: The mapping as produced by elasticsearch_dsl.Mapping.to_dict().
+        """
+        mapping_dsl = elasticsearch_dsl.Mapping()
+        mapping = schema or self.DEFAULT_MAPPING
+        for field_name, field_type in mapping.items():
+            # Fields whose type name is missing from TYPE_MAPPING are left out of the mapping.
+            if field_type in self.TYPE_MAPPING:
+                # We instantiate the class stored in the type mapping.
+                mapping_dsl.field(field_name, self.TYPE_MAPPING[field_type]())
+        return mapping_dsl.to_dict()
+    @_elastic_connection
+    def add_mapping(self, index_name: str, schema: dict):
+        """Puts a mapping on an index.
+        :param: index_name str: Name of the index to update.
+        :param: schema dict: Mapping body, e.g. the output of generate_mapping().
+        :return: Response of the put mapping request.
+        """
+        index = Index(name=index_name)
+        return index.put_mapping(body=schema, using=self.elasticsearch)
+    @_elastic_connection
+    def create_index(
+            self,
+            index: str,
+            shards: int = 3,
+            replicas: int = 1,
+            settings: Optional[dict] = None,
+    ) -> Dict | None:
+        """Creates empty index.
+        :param: index str: Name of the index to create.
+        :param: shards int: Number of shards for the index.
+        :param: replicas int: Number of replicas of the index.
+        :param: settings dict: Overwrite settings for the index.
+        :return: Elastic's response to the create request, or None if the index already exists.
+        """
+        index_exists = self.elasticsearch.indices.exists(index=index).body
+        if index_exists is False:
+            setting_body = settings or {
+                "number_of_shards": shards,
+                "number_of_replicas": replicas,
+            }
+            return self.elasticsearch.indices.create(index=index, settings=setting_body)
+        return None
+    @_elastic_connection
+    def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
+        """Deletes index.
+        :param: index str: Name of the index to be deleted.
+        :param: ignore bool: Ignore errors because of closed/deleted index.
+        :return: Dict of Elastic's acknowledgement of the action.
+        """
+        response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore, expand_wildcards="open")
+        return response
+    @_elastic_connection
+    def delete_document(self, index: str, document_id: str) -> ObjectApiResponse[Any]:
+        """Deletes document from index.
+        :param: index str: Index where the document is to be found.
+        :param: document_id str: ID of the document to be deleted.
+        :return: Elastic's response to the delete request.
+        """
+        response = self.elasticsearch.delete(id=document_id, index=index)
+        return response
+    @_elastic_connection
+    def bulk_index(
+            self,
+            documents: Iterator[dict],
+            index_prefix: str,
+            rollover_limit: int,
+            refresh="false",
+            create_index: bool = True
+    ) -> tuple[int, Any]:
+        """Indexes documents in bulk into the current rollover index.
+        The rollover check runs once per call, before indexing, so all documents of
+        one call end up in the same index.
+        :param: documents Iterator[dict]: Documents to index.
+        :param: index_prefix str: Prefix of the rollover indices.
+        :param: rollover_limit int: Document count at which a new index is started.
+        :param: refresh str: Refresh policy forwarded to the bulk helper.
+        :param: create_index bool: Create the target index if needed and put the default mapping on it first.
+        :return: Tuple of the successful action count and the errors, both as returned by the bulk helper.
+        """
+        last_index_name = self._produce_rollover_index(index_prefix, rollover_limit)
+        if create_index:
+            self.create_index(index=last_index_name)
+            self.add_mapping(index_name=last_index_name, schema=self.generate_mapping())
+        actions = [{"_index": last_index_name, "_source": document} for document in documents]
+        successful_count, errors = bulk(actions=actions, client=self.elasticsearch, max_retries=3, refresh=refresh)
+        return successful_count, errors
+    @_elastic_connection
+    def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
+        """Indexes document.
+        :param: index str: Index that document will be indexed into.
+        :param: body dict: Document body.
+        :param: document_id str: Optional id for the document. Is generated automatically if None.
+        :return: Dict of Elastic's acknowledgement of the action.
+        """
+        if document_id:
+            indexed = self.elasticsearch.index(index=index, id=document_id, body=body)
+        else:
+            indexed = self.elasticsearch.index(index=index, body=body)
+        return indexed
+    @_elastic_connection
+    def get_documents_by_key(self, index: str, document_key: str, sort_fields=("start_page", "end_page", "sequence_nr",)) -> list:
+        """Fetches the documents that share a parent_id from all rollover indices.
+        :param: index str: Index prefix; the search runs against "<index>-*".
+        :param: document_key str: Value matched against the parent_id field.
+        :param: sort_fields tuple: Document fields the result is ordered by.
+        :return: List of matching documents sorted by sort_fields.
+        """
+        index = f"{index}-*"
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+        s = s.query("match", parent_id=document_key).sort(*sort_fields)
+        # Since scan doesn't allow for sorting, we do it manually after fetching the documents.
+        documents = sorted(
+            s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
+        )
+        return documents
+    def __str__(self) -> str:
+        """Returns the URL this instance was created with."""
+        return self.elasticsearch_url

{rara_tools-0.0.4 → rara_tools-0.0.8/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.4
+Version: 0.0.8
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

@@ -13,6 +13,9 @@ rara_tools.egg-info/SOURCES.txt
 rara_tools.egg-info/dependency_links.txt
 rara_tools.egg-info/requires.txt
 rara_tools.egg-info/top_level.txt
+rara_tools/constants/__init__.py
+rara_tools/constants/digitizer.py
+rara_tools/constants/general.py
 tests/test_elastic.py
 tests/test_s3_exceptions.py
 tests/test_s3_file_operations.py

rara_tools-0.0.8/tests/test_elastic.py ADDED Viewed

@@ -0,0 +1,132 @@
+import json
+import os
+import time
+import uuid
+from time import sleep
+import pytest
+from rara_tools.elastic import KataElastic
+# Documents used by the indexing tests; the path is relative to the current working directory.
+with open("./tests/test_data/elastic_docs.json") as fh:
+    TEST_DOCUMENTS = json.load(fh)
+# Elasticsearch instance under test; overridable through ELASTIC_TEST_URL.
+es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
+ELASTIC = KataElastic(es_url)
+# Second client with a different URL; test_check expects check() to return False for it.
+ELASTIC_BAD = KataElastic("http://locallost:9012")
+TEST_INDEX_NAME = "tools_testing_index"
+# Both are filled in by test_bulk_indexing_and_document_fetch and read by test_document_deleting.
+TEST_DOCUMENT_ID = None
+TEST_DOCUMENT_INDEX = None
+# Random parent_id given to the documents generated in the rollover test.
+PARENT_ID = uuid.uuid4().hex
+@pytest.mark.order(1)
+def test_index_creation():
+    """ Tests that the test index gets created.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_INDEX_NAME)
+    assert created["acknowledged"] is True
+    # Fixed pause before the next tests run.
+    time.sleep(2)
+@pytest.mark.order(2)
+def test_check():
+    """Tests health check method against the test instance and against the bad URL.
+    """
+    assert ELASTIC.check() is True
+    # test bad connection
+    assert ELASTIC_BAD.check() is False
+@pytest.mark.order(2)
+def test_creating_index_again():
+    """
+    Test to see that running the function for index generation doesn't trigger errors
+    on duplicates.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_INDEX_NAME)
+    # create_index returns None when the index already exists.
+    assert created is None
+@pytest.mark.order(3)
+def test_adding_mapping_to_index():
+    """Test adding mapping to an index"""
+    # No schema is passed, so the mapping is generated from KataElastic.DEFAULT_MAPPING.
+    schema = ELASTIC.generate_mapping()
+    result = ELASTIC.add_mapping(TEST_INDEX_NAME, schema)
+    assert result["acknowledged"] is True
+    # Test adding the mapping again doesn't create errors.
+    result = ELASTIC.add_mapping(TEST_INDEX_NAME, schema)
+    assert result["acknowledged"] is True
+@pytest.mark.order(4)
+def test_document_addition():
+    """Tests indexing the test documents one by one into the test index."""
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+@pytest.mark.order(5)
+def test_bulk_indexing_documents_cause_rollover():
+    """
+    Tests that bulk indexing starts a new index once the newest one has reached the rollover limit.
+    """
+    data = [{"start_page": number, "sequence_nr": 1, "end_page": number, "parent_id": PARENT_ID} for number in range(10)]
+    # 10 documents in chunks of 3 give chunk sizes 3, 3, 3 and 1.
+    chunks = [data[i:i + 3] for i in range(0, len(data), 3)]
+    for chunk in chunks:
+        success, _ = ELASTIC.bulk_index(chunk, TEST_INDEX_NAME, rollover_limit=3, refresh="wait_for")
+        # Compare by value; "is" checks object identity and must not be used with int literals.
+        assert success == len(chunk)
+    created_indices = ELASTIC.elasticsearch.indices.get(index=f"{TEST_INDEX_NAME}-*", expand_wildcards="open").body
+    # Every chunk finds the previous index at the limit, so each of the 4 chunks gets its own index.
+    assert len(created_indices) == 4
+@pytest.mark.order(6)
+def test_bulk_indexing_and_document_fetch():
+    """
+    Test that the whole process of indexing a bunch of different texts and then the retrieval
+    of only the requested documents works as intended.
+    """
+    global TEST_DOCUMENT_ID
+    global TEST_DOCUMENT_INDEX
+    ELASTIC.bulk_index(TEST_DOCUMENTS, TEST_INDEX_NAME, rollover_limit=3, refresh="wait_for")
+    # Test the integrity of the limiting query.
+    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "foo")
+    assert len(result) == 2
+    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
+    # Check the length first so an empty result fails the assertion instead of raising IndexError.
+    assert len(result) == 1
+    # Remember the document for test_document_deleting.
+    TEST_DOCUMENT_ID = result[0].meta.id
+    TEST_DOCUMENT_INDEX = result[0].meta.index
+    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "loll")
+    assert len(result) == 0
+    # Check that sorting works as expected.
+    results = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, PARENT_ID)
+    # Without this the loop below would pass without checking anything on an empty result.
+    assert results
+    for index, document in enumerate(results):
+        assert document.start_page == index
+@pytest.mark.order(7)
+def test_document_deleting():
+    """
+    Tests deleting a document from index.
+    """
+    # Index and id were stored by test_bulk_indexing_and_document_fetch.
+    deleted = ELASTIC.delete_document(TEST_DOCUMENT_INDEX, TEST_DOCUMENT_ID)
+    assert deleted["result"] == "deleted"
+    sleep(1)
+    # check if document was actually deleted
+    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
+    assert len(result) == 0
+@pytest.mark.order(8)
+def test_index_deleting():
+    """
+    Tests deleting index. We delete the test index now.
+    """
+    deleted = ELASTIC.delete_index(TEST_INDEX_NAME)
+    # Also remove the "<name>-<number>" indices that the bulk indexing tests wrote to.
+    for i in range(10):
+        ELASTIC.delete_index(f"{TEST_INDEX_NAME}-{i}")
+    # Only the response for the plain test index is checked.
+    assert deleted["acknowledged"] is True

rara_tools-0.0.4/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.0.4

rara_tools-0.0.4/rara_tools/elastic.py DELETED Viewed

@@ -1,92 +0,0 @@
-from typing import Dict, Optional, List
-from elasticsearch import Elasticsearch
-from elasticsearch_dsl import Search
-from .decorators import _elastic_connection
-class KataElastic:
-    """A class to manage all required Elasticsearch operations for Kata.
-    """
-    def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
-        self.timeout = timeout
-        self.elasticsearch_url = elasticsearch_url
-        self.elasticsearch = Elasticsearch(self.elasticsearch_url, request_timeout=self.timeout)
-    @_elastic_connection
-    def check(self) -> bool:
-        """Checks Elasticsearch connection.
-        :return: bool: Elasticsearch alive or dead.
-        """
-        if self.elasticsearch.ping():
-            return True
-        return False
-    @_elastic_connection
-    def create_index(
-            self,
-            index: str,
-            shards: int = 3,
-            replicas: int = 1,
-            settings: Optional[dict] = None
-    ) -> Dict:
-        """Creates empty index.
-        :param: index str: Name of the index to create.
-        :param: shards int: Number of shards for the index.
-        :param: replicas int: Number of replicas of the index.
-        :param: settings dict: Overwrite settings for the index.
-        """
-        body = settings or {
-            "number_of_shards": shards,
-            "number_of_replicas": replicas,
-        }
-        return self.elasticsearch.indices.create(index=index, settings=body)
-    @_elastic_connection
-    def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
-        """Deletes index.
-        :param: index str: Name of the index to be deleted.
-        :param: ignore bool: Ignore errors because of closed/deleted index.
-        :return: Dict of Elastic's acknowledgement of the action.
-        """
-        response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore)
-        return response
-    @_elastic_connection
-    def delete_document(self, index: str, document_id: str) -> Dict:
-        """Deletes document fom index.
-        :param: document_id str: ID of the document to be deleted.
-        :param: index str: Index where the document is to be found.
-        :param: ignore bool: Ignore errors because of closed/deleted index.
-        :return: Dict of Elastic's acknowledgement of the action.
-        """
-        response = self.elasticsearch.delete(id=document_id, index=index)
-        return response
-    @_elastic_connection
-    def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
-        """Indexes document.
-        :param: index str: Index that document will be indexed into.
-        :param: body dict: Document body.
-        :param: document_id str: Optional id for the document. Is generated automatically if None.
-        :return: Dict of Elastic's acknowledgement of the action.
-        """
-        if document_id:
-            indexed = self.elasticsearch.index(index=index, id=document_id, body=body)
-        else:
-            indexed = self.elasticsearch.index(index=index, body=body)
-        return indexed
-    @_elastic_connection
-    def get_documents_by_key(self, index: str, document_key: str) -> List:
-        """This method is for retrieving all texts/pages of the original document.
-        :param: index str: Index to search the documents from.
-        :param: document_key str: parent_id field that connects pages of document together.
-        :return: List of matching documents.
-        """
-        s = Search(using=self.elasticsearch, index=index)
-        docs = s.query("match", parent_id=document_key).execute()
-        return docs
-    def __str__(self) -> str:
-         return self.elasticsearch_url

rara_tools-0.0.4/tests/test_elastic.py DELETED Viewed

@@ -1,69 +0,0 @@
-import pytest
-import json
-import os
-from time import sleep
-from rara_tools.elastic import KataElastic
-with open("./tests/test_data/elastic_docs.json") as fh:
-    TEST_DOCUMENTS = json.load(fh)
-es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
-ELASTIC = KataElastic(es_url)
-ELASTIC_BAD = KataElastic("http://locallost:9012")
-TEST_INDEX_NAME = "tools_testing_index"
-TEST_DOCUMENT_ID = None
-@pytest.mark.order(1)
-def test_index_creation_and_data_indexing():
-    """ Tests if index created and documents indexed.
-    """
-    # Create test index
-    created = ELASTIC.create_index(TEST_INDEX_NAME)
-    assert created["acknowledged"] is True
-    # Add test documents
-    for document in TEST_DOCUMENTS:
-        indexed = ELASTIC.index_document(TEST_INDEX_NAME, document)
-        assert indexed["result"] == "created"
-    # let it index
-    sleep(1)
-@pytest.mark.order(2)
-def test_check():
-    """Tests health check method.
-    """
-    assert ELASTIC.check() is True
-    # test bad connection
-    assert ELASTIC_BAD.check() is False
-@pytest.mark.order(3)
-def test_get_document_by_key():
-    """Tests if correct documents fetched.
-    """
-    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "foo")
-    assert len(result) == 2
-    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
-    global TEST_DOCUMENT_ID
-    TEST_DOCUMENT_ID = result[0].meta.id
-    assert len(result) == 1
-    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "loll")
-    assert len(result) == 0
-@pytest.mark.order(5)
-def test_document_deleting():
-    """ Tests deleting a document from index.
-    """
-    deleted = ELASTIC.delete_document(TEST_INDEX_NAME, TEST_DOCUMENT_ID)
-    assert deleted["result"] == "deleted"
-    sleep(1)
-    # check if document was actually deleted
-    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
-    assert len(result) == 0
-@pytest.mark.order(5)
-def test_index_deleting():
-    """ Tests deleting index. We delete the test index now.
-    """
-    deleted = ELASTIC.delete_index(TEST_INDEX_NAME)
-    assert deleted["acknowledged"] is True

{rara_tools-0.0.4 → rara_tools-0.0.8}/LICENSE.md RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/README.md RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools/decorators.py RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools/exceptions.py RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools/s3.py RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools/task_reporter.py RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools.egg-info/requires.txt RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/rara_tools.egg-info/top_level.txt RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/requirements.txt RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/setup.cfg RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/tests/test_s3_exceptions.py RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/tests/test_s3_file_operations.py RENAMED Viewed

File without changes

{rara_tools-0.0.4 → rara_tools-0.0.8}/tests/test_task_reporter.py RENAMED Viewed

File without changes

rara-tools 0.0.4__tar.gz → 0.0.8__tar.gz

Potentially problematic release.

rara-tools 0.0.4__tar.gz → 0.0.8__tar.gz