PyPI - rara-tools - Versions diffs - 0.0.10__tar.gz → 0.0.12__tar.gz - Mend

rara-tools 0.0.10tar.gz → 0.0.12tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (31) hide show

{rara_tools-0.0.10/rara_tools.egg-info → rara_tools-0.0.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.10
+Version: 0.0.12
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.0.12/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.0.12

{rara_tools-0.0.10 → rara_tools-0.0.12}/rara_tools/digar_schema_converter.py RENAMED Viewed

@@ -3,7 +3,7 @@ from typing import List, NoReturn
 from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
-GENERAL_DOC_IDENTIFIER = "Filepath"
 UNDEFINED_LANGUAGE_VALUE = "unk"
 QUALITY_RATIO_TYPE = "Float"
@@ -104,13 +104,15 @@ class DocSchemas:
             generated_id: str = "",
             permalink: str = "",
             min_language_ratio: float = 0.2,
-            convert_ratio: bool = True
+            convert_ratio: bool = True,
+            generated_id_type: str = "CustomID"
     ) -> NoReturn:
         self.__convert_ratio = convert_ratio
         self.__min_language_ratio = min_language_ratio
         self.__sierra_id = sierra_id
         self.__generated_id = generated_id
         self.__permalink = permalink
+        self.__generated_id_type = generated_id_type
         self.__doc_meta = doc_meta
         self.__ocr_accuracy_schema: dict = {}
         self.__text_quality_schema: dict = {}
@@ -209,7 +211,7 @@ class DocSchemas:
                 identifiers.append(
                     {
                         "@type": "Identifier",
-                        "qualifier": GENERAL_DOC_IDENTIFIER,
+                        "qualifier": self.__generated_id_type,
                         "value": self.__generated_id
                     }
                 )
@@ -235,6 +237,7 @@ class DIGARSchemaConverter:
             generated_id: str,
             sierra_id: str = "",
             permalink: str = "",
+            generated_id_type: str = "CustomID",
             min_language_ratio: float = 0.2,
             convert_ratio: bool = False
     ) -> NoReturn:
@@ -250,6 +253,8 @@ class DIGARSchemaConverter:
             Document's corresponding Sierra ID.
         permalink: str
             Permanent link, where the document can be accessed.
+        generated_id_type: str
+            Method / type of generated ID (e.g. 'UUID')
         min_language_ratio: float
             Cutoff ratio for languages. If ratio for some language
             does not exceed the set threshold, the language will not
@@ -264,6 +269,7 @@ class DIGARSchemaConverter:
         self.__sierra_id: str = sierra_id
         self.__generated_id: str = generated_id
         self.__permalink: str = permalink.removesuffix("/")
+        self.__generated_id_type: str = generated_id_type
         self.__texts: List[dict] = []
         self.__images: List[dict] = []
         self.__doc_meta: dict = {}
@@ -281,7 +287,8 @@ class DIGARSchemaConverter:
             generated_id=self.__generated_id,
             permalink=self.__permalink,
             min_language_ratio=self.__min_language_ratio,
-            convert_ratio=self.__convert_ratio
+            convert_ratio=self.__convert_ratio,
+            generated_id_type=self.__generated_id_type
         )
         self.__digar_schema: dict = {}

{rara_tools-0.0.10 → rara_tools-0.0.12}/rara_tools/elastic.py RENAMED Viewed

@@ -1,10 +1,11 @@
-from typing import Any, Dict, Iterator, Optional
+from typing import Any, Dict, Iterator, Optional, List
 import elasticsearch_dsl
 from elastic_transport import ObjectApiResponse
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 from elasticsearch_dsl import Index
+from elasticsearch_dsl.response import Response
 from .decorators import _elastic_connection
@@ -82,6 +83,63 @@ class KataElastic:
     def add_mapping(self, index_name: str, schema: dict):
         """Put the mapping ``schema`` on the index named ``index_name``.
         :param: index_name str: Index that receives the mapping.
         :param: schema dict: Mapping body passed to ``Index.put_mapping``.
         :return: Whatever ``Index.put_mapping`` returns.
         """
         return Index(name=index_name).put_mapping(
             body=schema, using=self.elasticsearch
         )
+    @_elastic_connection
+    def add_vector_mapping(
+            self,
+            index_name: str,
+            field: str,
+            schema: Optional[dict] = None,
+            dims: int = 1024
+    ) -> dict:
+        vector_mapping = {
+            "properties": {
+                field: {
+                    "type": "dense_vector",
+                    "dims": dims
+                }
+            }
+        }
+        mapping = schema or vector_mapping
+        index = Index(name=index_name)
+        return index.put_mapping(body=mapping, using=self.elasticsearch)
+    @_elastic_connection
+    def add_ann_vector_mapping(
+            self,
+            index_name: str,
+            field: str,
+            schema: Optional[dict] = None,
+            dims: int = 1024
+    ) -> dict:
+        vector_mapping = {
+            "properties": {
+                field: {
+                    "type": "dense_vector",
+                    "dims": dims,
+                    "similarity": "cosine",
+                    "index": True
+                }
+            }
+        }
+        mapping = schema or vector_mapping
+        index = Index(name=index_name)
+        return index.put_mapping(body=mapping, using=self.elasticsearch)
+    @_elastic_connection
+    def add_vector(
+            self,
+            index_name: str,
+            document_id: str,
+            vector: List[float],
+            field: str
+    ) -> dict:
+        schema = {"doc": {field: vector}}
+        return self.elasticsearch.update(
+            index=index_name, id=document_id, body=schema, refresh="wait_for"
+        )
     @_elastic_connection
     def create_index(
@@ -170,6 +228,130 @@ class KataElastic:
             s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
         )
         return documents
+    @_elastic_connection
+    def execute_fuzzy_search(
+            self,
+            index: str,
+            field: str,
+            entity: str,
+            fuzziness: int = 2,
+            prefix_length: int = 1,
+            max_expansions: int = 50
+    ) -> Response:
+        """Executes a fuzzy search.
+        :param: index str: Index to search from.
+        :param: entity str: Entity to search matches for.
+        :param: fuzziness int: Maximum edit distance for a match.
+        :param: prefix_length int: Number of characters in the prefix that
+            should overlap with the original entity's prefix.
+        :param: max_expansion int: maximum number of terms the fuzzy query
+            will match before halting the search
+        :return: Dict on search results.
+        """
+        query_params = {
+            f"{field}.keyword": {
+                "value": entity,
+                "fuzziness": fuzziness,
+                "max_expansions": max_expansions,
+                "prefix_length": prefix_length
+            }
+        }
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+        s = s.query("fuzzy", **query_params)
+        response = s.execute()
+        return response
+    def execute_ann_vector_search(
+            self,
+            index: str,
+            field: str,
+            query_vector: List[float],
+            k: int = 10,
+            num_candidates: int = 100,
+            n_docs: int = 10,
+            elastic_ids: List[str] = []
+    ) -> Response:
+        """ Execute a vector search.
+        NB! Works only with ANN mapping!
+        :param: index str: Index to search from.
+        :param: field str: Field containing vectorized data.
+        :param: query vector List[float]: Vector to search matches for.
+        :param: k int: Number of nearest neighbors to return.
+        :param: num_candidates int: Number of candidates considered before selecting k results.
+        :param: n_docs: int: Number of documents to return.
+        :param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
+        """
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+        # Add kNN vector search
+        s = s.extra(
+            knn={
+                "field": field,
+                "query_vector": query_vector,
+                "k": k,
+                "num_candidates": num_candidates
+            }
+        )
+        # Add ID filtering, if elastic_ids are specified
+        if elastic_ids:
+            s = s.query(
+                elasticsearch_dsl.Q("terms", _id=elastic_ids)
+            )
+        # Sort by score and return `n_docs` best-matching documents
+        s = s.extra(size=n_docs)
+        # Execute the search
+        response = s.execute()
+        return response
+    def execute_script_score_vector_search(
+            self,
+            index: str,
+            field: str,
+            query_vector: List[float],
+            n_docs: int = 10,
+            elastic_ids: List[str] = []
+    ) -> Response:
+        """ Execute a vector search.
+        NB! Requires different mapping than ANN!
+        :param: index str: Index to search from.
+        :param: field str: Field containing vectorized data.
+        :param: query vector List[float]: Vector to search matches for.
+        :param: n_docs: int: Number of documents to return.
+        :param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
+        """
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+        if elastic_ids:
+            query = elasticsearch_dsl.Q("terms", _id=elastic_ids)
+        else:
+            query = elasticsearch_dsl.Q("match_all")
+        # Apply script_score query
+        s = s.query(
+            "script_score",
+            query=query,
+            script={
+                "source": f"1.0 + cosineSimilarity(params.query_vector, '{field}')",
+                "params": {
+                    "query_vector": query_vector
+                }
+            }
+        )
+        # Set min_score and limit number of documents
+        s = s.extra(size=n_docs)
+        # Execute search
+        response = s.execute()
+        return response
     def __str__(self) -> str:
         """Return the ``elasticsearch_url`` attribute as the string form."""
         return self.elasticsearch_url

{rara_tools-0.0.10 → rara_tools-0.0.12/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.10
+Version: 0.0.12
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

{rara_tools-0.0.10 → rara_tools-0.0.12}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

@@ -22,6 +22,7 @@ rara_tools/constants/general.py
 tests/test_converters.py
 tests/test_digar_schema_converter.py
 tests/test_elastic.py
+tests/test_elastic_vector_and_search_operations.py
 tests/test_s3_exceptions.py
 tests/test_s3_file_operations.py
 tests/test_task_reporter.py

rara_tools-0.0.12/tests/test_elastic_vector_and_search_operations.py ADDED Viewed

@@ -0,0 +1,167 @@
+import json
+import os
+import pytest
+from time import sleep
+from rara_tools.elastic import KataElastic
+def load_json(file_path: str):
+    with open(file_path, "r") as fh:
+        data = json.load(fh)
+    return data
+# Fixtures are loaded once at import time; the paths are relative to the
+# current working directory.
+TEST_DOCUMENTS = load_json("./tests/test_data/elastic_vectorized_docs.json")
+TEST_VECTOR_DATA = load_json("./tests/test_data/test_vector_data.json")
+# Query vector used by the vector-search tests.
+TEST_VECTOR = TEST_VECTOR_DATA.get("vector")
+# Elasticsearch URL: taken from ELASTIC_TEST_URL, otherwise the default below.
+es_url = os.getenv("ELASTIC_TEST_URL", "http://rara-elastic.texta.ee:9200")  # local alternative: http://localhost:9200
+ELASTIC = KataElastic(es_url)
+# Index names and the vector field shared by the tests in this module.
+TEST_KNN_INDEX_NAME = "tools_knn_testing_index"
+TEST_ANN_INDEX_NAME = "tools_ann_testing_index"
+TEST_VECTOR_FIELD = "vector"
+@pytest.mark.order(1)
+def test_index_creation_with_knn_vector_mapping():
+    """ Tests if index created and documents indexed.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_KNN_INDEX_NAME)
+    assert created["acknowledged"] is True
+    result = ELASTIC.add_vector_mapping(
+        index_name=TEST_KNN_INDEX_NAME,
+        field=TEST_VECTOR_FIELD
+    )
+    assert result["acknowledged"] is True
+@pytest.mark.order(2)
+def test_index_creation_with_ann_vector_mapping():
+    """ Tests if index created and documents indexed.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_ANN_INDEX_NAME)
+    assert created["acknowledged"] is True
+    result = ELASTIC.add_ann_vector_mapping(
+        index_name=TEST_ANN_INDEX_NAME,
+        field=TEST_VECTOR_FIELD
+    )
+    assert result["acknowledged"] is True
+@pytest.mark.order(3)
+def test_vectorized_document_addition_knn_index():
+    """ Tests indexing vectorized documents.
+    """
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_KNN_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+@pytest.mark.order(4)
+def test_vectorized_document_addition_ann_index():
+    """ Tests indexing vectorized documents.
+    """
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_ANN_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+@pytest.mark.order(5)
+def test_fuzzy_search():
+    """ Tests fuzzy search.
+    """
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=0
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 2
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+@pytest.mark.order(6)
+def test_ann_vector_search():
+    """ Tests ANN vector search.
+    """
+    # Execut fuzzy search to get ID restrictions
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+    elastic_ids = [hit.meta.id for hit in response]
+    response = ELASTIC.execute_ann_vector_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="vector",
+        query_vector=TEST_VECTOR,
+        k=1,
+        n_docs=1,
+        num_candidates=10,
+        elastic_ids=elastic_ids
+    )
+    descriptions = [hit.description for hit in response]
+    assert len(descriptions) == 1
+    assert descriptions[0] == "Eesti maletaja ja maleteoreetik"
+@pytest.mark.order(7)
+def test_script_score_vector_search():
+    """ Tests ANN vector search.
+    """
+    # Execut fuzzy search to get ID restrictions
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_KNN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+    elastic_ids = [hit.meta.id for hit in response]
+    response = ELASTIC.execute_script_score_vector_search(
+        index=TEST_KNN_INDEX_NAME,
+        field="vector",
+        query_vector=TEST_VECTOR,
+        n_docs=1,
+        elastic_ids=elastic_ids
+    )
+    descriptions = [hit.description for hit in response]
+    assert len(descriptions) == 1
+    assert descriptions[0] == "Eesti maletaja ja maleteoreetik"
+@pytest.mark.order(8)
+def test_index_deleting():
+    """
+    Tests deleting index. We delete the test index now.
+    """
+    indices = [TEST_KNN_INDEX_NAME, TEST_ANN_INDEX_NAME]
+    for index in indices:
+        deleted = ELASTIC.delete_index(index)
+        sleep(1)
+        assert deleted["acknowledged"] is True