PyPI - aatm - Versions diffs - 0.1.0__py3-none-any.whl - Mend

aatm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

aatm/__init__.py +3 -0
aatm/aio/__init__.py +0 -0
aatm/aio/selectors.py +173 -0
aatm/aio/translators.py +79 -0
aatm/api/__init__.py +0 -0
aatm/api/config.py +68 -0
aatm/api/data_models.py +85 -0
aatm/api/main.py +160 -0
aatm/data_models.py +921 -0
aatm/debug.py +91 -0
aatm/embedding_functions.py +290 -0
aatm/extractors.py +199 -0
aatm/local_database_utils.py +334 -0
aatm/logs.py +111 -0
aatm/main.py +658 -0
aatm/omop/__init__.py +4 -0
aatm/omop/condition_occurrence.py +227 -0
aatm/omop/device_exposure.py +149 -0
aatm/omop/drug_exposure.py +249 -0
aatm/omop/registry.py +23 -0
aatm/pipeline.py +78 -0
aatm/prompt_helpers.py +48 -0
aatm/registries/__init__.py +0 -0
aatm/registries/rerankers.py +114 -0
aatm/registries/retrievers.py +175 -0
aatm/registries/selectors.py +157 -0
aatm/registries/translators.py +93 -0
aatm/rerankers.py +368 -0
aatm/retrievers.py +200 -0
aatm/search_ui.py +129 -0
aatm/selectors.py +448 -0
aatm/sql_commands.yaml +78 -0
aatm/terminology_mapper.py +594 -0
aatm/time.py +18 -0
aatm/translators.py +294 -0
aatm-0.1.0.dist-info/METADATA +241 -0
aatm-0.1.0.dist-info/RECORD +40 -0
aatm-0.1.0.dist-info/WHEEL +4 -0
aatm-0.1.0.dist-info/entry_points.txt +3 -0
aatm-0.1.0.dist-info/licenses/LICENSE +21 -0

aatm/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .logs import configure_logging
+configure_logging()

aatm/aio/__init__.py ADDED Viewed

File without changes

aatm/aio/selectors.py ADDED Viewed

@@ -0,0 +1,173 @@
+import asyncio
+from typing import List
+import dotenv
+from openai import AsyncOpenAI
+from google.genai import types
+from aatm.data_models import (
+    EmptySelectionMetadata,
+    RetrieverResults,
+    SelectedExpressionMetadata,
+    SelectedResult,
+    SelectorResults,
+)
+from aatm.prompt_helpers import format_prompt
+from aatm.selectors import OpenAILLMSelector, GeminiLLMSelector
+from aatm.debug import DebugMode, get_debug_mode
+from aatm.logs import get_logger
+# Load environment variables
+dotenv.load_dotenv()
+# Module-level logger and debug mode; both are resolved when this module is imported.
+logger = get_logger(__name__)
+debug_mode = get_debug_mode()
+class AsyncOpenAILLMSelector(OpenAILLMSelector):
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.client = AsyncOpenAI()
+    async def select(self, results: RetrieverResults) -> SelectorResults:
+        selector_results = SelectorResults(results=[], queries=results.queries)
+        async with asyncio.TaskGroup() as tg:
+            tasks = []
+            for query_id, query in enumerate(results.queries):
+                prompt = format_prompt(
+                    self.prompt_template,
+                    {
+                        "json_format": SelectedResult.model_json_schema(),
+                        "query": query.capitalize(),  # avoid case sensitivity for some queries like 'cough' and 'Cough'
+                        "expressions": [
+                            r.to_prompt_object() for r in results.results[query_id]
+                        ],
+                    },
+                )
+                tasks.append(
+                    tg.create_task(
+                        self.client.responses.parse(
+                            model=self.model_id,
+                            input=prompt,
+                            text_format=SelectedResult,
+                        )
+                    )
+                )
+        responses = [t.result() for t in tasks]
+        for response, (query_id, query) in zip(responses, enumerate(results.queries)):
+            selected_result: SelectedResult = response.output_parsed
+            assert isinstance(selected_result, SelectedResult), (
+                f"Expected SelectedResult object from OpenAI, but got {type(selected_result)}"
+            )
+            # case where no expression is selected
+            if selected_result.expression_id is None:
+                selector_results.results.append(EmptySelectionMetadata())
+                continue
+            # case where expression is selected but it is not in the results
+            results_expression_ids = set(
+                [r.expression_id for r in results.results[query_id]]
+            )
+            if selected_result.expression_id not in results_expression_ids:
+                selector_results.results.append(EmptySelectionMetadata())
+                continue
+            # case where expression is selected and it is in the results
+            for result_idx, r in enumerate(results.results[query_id]):
+                if r.expression_id == selected_result.expression_id:
+                    selector_results.results.append(
+                        SelectedExpressionMetadata(
+                            **r.model_dump(), result_list_index=result_idx
+                        )
+                    )
+                    break
+        return selector_results
+class AsyncGeminiLLMSelector(GeminiLLMSelector):
+    async def select(self, results: RetrieverResults) -> SelectorResults:
+        selector_results = SelectorResults(results=[], queries=results.queries)
+        async with asyncio.TaskGroup() as tg:
+            tasks = []
+            for query_id, query in enumerate(results.queries):
+                prompt = format_prompt(
+                    self.prompt_template,
+                    {
+                        "json_format": SelectedResult.model_json_schema(),
+                        "query": query.capitalize(),  # avoid case sensitivity for some queries like 'cough' and 'Cough'
+                        "expressions": [
+                            r.to_prompt_object() for r in results.results[query_id]
+                        ],
+                    },
+                )
+                gemini_formatted_prompt = []
+                for msg in prompt:
+                    gemini_formatted_prompt.append(types.UserContent(msg["content"]))
+                if debug_mode == DebugMode.GEMINI_LLM_SELECTOR:
+                    logger.debug(prompt)
+                    logger.debug(gemini_formatted_prompt)
+                tasks.append(
+                    tg.create_task(
+                        self.client.aio.models.generate_content(
+                            model=self.model_id,
+                            contents=gemini_formatted_prompt,
+                            config=types.GenerateContentConfig(
+                                response_mime_type="application/json",
+                                response_schema=SelectedResult,
+                            ),
+                        )
+                    )
+                )
+        responses: List[types.GenerateContentResponse] = [t.result() for t in tasks]
+        for response, (query_id, query) in zip(responses, enumerate(results.queries)):
+            selected_result: SelectedResult = SelectedResult.model_validate_json(
+                response.text
+            )
+            if debug_mode == DebugMode.GEMINI_LLM_SELECTOR:
+                logger.debug(response)
+                logger.debug(response.text)
+                logger.debug(SelectedResult.model_validate_json(response.text))
+            assert isinstance(selected_result, SelectedResult), (
+                f"Expected SelectedResult object from Gemini, but got {type(selected_result)}"
+            )
+            # case where no expression is selected
+            if selected_result.expression_id is None:
+                selector_results.results.append(EmptySelectionMetadata())
+                continue
+            # case where expression is selected but it is not in the results
+            results_expression_ids = set(
+                [r.expression_id for r in results.results[query_id]]
+            )
+            if selected_result.expression_id not in results_expression_ids:
+                selector_results.results.append(EmptySelectionMetadata())
+                continue
+            # case where expression is selected and it is in the results
+            for result_idx, r in enumerate(results.results[query_id]):
+                if r.expression_id == selected_result.expression_id:
+                    selector_results.results.append(
+                        SelectedExpressionMetadata(
+                            **r.model_dump(), result_list_index=result_idx
+                        )
+                    )
+                    break
+        return selector_results

aatm/aio/translators.py ADDED Viewed

@@ -0,0 +1,79 @@
+import asyncio
+import json
+from google.genai import types
+from typing import List
+from openai import AsyncOpenAI
+# Custom modules
+from aatm.data_models import Translation
+from aatm.prompt_helpers import format_prompt
+from aatm.translators import BaseTranslator, GeminiTranslator, OpenAITranslator
+class EmptyTranslator(BaseTranslator):
+    async def translate(self, texts: List[str]) -> List[Translation]:
+        return [Translation(text=t) for t in texts]
+class AsyncGeminiTranslator(GeminiTranslator):
+    async def translate(self, texts: List[str]) -> List[Translation]:
+        async with asyncio.TaskGroup() as tg:
+            tasks = [
+                tg.create_task(
+                    self.client.aio.models.generate_content(
+                        model=self.model,
+                        contents=self.prompt_template.format(text=t),
+                        config=types.GenerateContentConfig(
+                            response_mime_type="application/json",
+                            response_schema=Translation,
+                        ),
+                    )
+                )
+                for t in texts
+            ]
+        results = [t.result() for t in tasks]
+        processed_results = []
+        for result, t in zip(results, texts):
+            try:
+                processed_results.append(Translation(**json.loads(result.text)))
+            except Exception as e:
+                print(
+                    f"Error while processing text '{t}' and response '{result}': {e}. Original text was maintained."
+                )
+                processed_results.append(Translation(text=t))
+        return processed_results
+class AsyncOpenAITranslator(OpenAITranslator):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = AsyncOpenAI()
+    async def translate(self, texts):
+        async with asyncio.TaskGroup() as tg:
+            tasks = [
+                tg.create_task(
+                    self.client.responses.parse(
+                        model=self.model_id,
+                        input=format_prompt(self.prompt_template, {"text": t}),
+                        text_format=Translation,
+                    )
+                )
+                for t in texts
+            ]
+        results = [t.result() for t in tasks]
+        processed_results = []
+        for result, t in zip(results, texts):
+            try:
+                processed_results.append(result.output_parsed)
+            except Exception as e:
+                print(
+                    f"Error while processing text '{t}' and response '{result}': {e}. Original text was maintained."
+                )
+                processed_results.append(Translation(text=t))
+        return processed_results

aatm/api/__init__.py ADDED Viewed

File without changes

aatm/api/config.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Configuration model and persistence utilities for the AATM API.
+This module defines the `APIConfig` model, which stores runtime configuration
+for the API server and provides helper methods to save the configuration to
+disk and load it back from a YAML file.
+"""
+from pathlib import Path
+from typing import Optional
+from pydantic import BaseModel, ConfigDict
+import yaml
+class APIConfig(BaseModel):
+    """Configuration model for the AATM API server.
+    This model stores the runtime settings used to serve the API, including host,
+    port, batching behavior, optional rate limiting, and worker configuration. It
+    also supports persistence to and from a YAML file.
+    Attributes:
+        DEFAULT_PATH: Default filesystem path used to save and load the API
+            configuration.
+        host: Host interface on which the API server listens.
+        port: Port on which the API server listens.
+        batch_size: Batch size used by the API processing pipeline.
+        rate_limit: Optional maximum number of documents allowed per minute.
+        workers: Optional number of worker processes.
+    """
+    DEFAULT_PATH: Path = Path(".aatm/api_config.yaml")
+    host: str
+    port: str
+    batch_size: int
+    rate_limit: Optional[int] = None
+    workers: Optional[int] = None
+    model_config = ConfigDict(extra="allow")
+    def save_to_disk(self, path: str | Path = DEFAULT_PATH) -> None:
+        """Save the API configuration to a YAML file on disk.
+        Args:
+            path: Destination path where the configuration should be written. If not
+                provided, the default configuration path is used.
+        Returns:
+            None.
+        """
+        if isinstance(path, str):
+            path = Path(path)
+        path.write_text(yaml.safe_dump(self.model_dump(mode="json")))
+    @classmethod
+    def load_from_disk(cls, path: str | Path = DEFAULT_PATH) -> "APIConfig":
+        """Load the API configuration from a YAML file on disk.
+        Args:
+            path: Path to the YAML configuration file. If not provided, the default
+                configuration path is used.
+        Returns:
+            An `APIConfig` instance initialized from the contents of the file.
+        """
+        if isinstance(path, str):
+            path = Path(path)
+        return cls(**yaml.safe_load(path.read_text()))

aatm/api/data_models.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""Request models for the terminology mapping API.
+This module defines Pydantic models used to validate incoming API requests for
+terminology mapping and retrieval. It includes request schemas for mapping
+source concepts through the terminology pipeline and for performing retriever-
+based searches.
+"""
+from typing import Any, List, Optional
+from fastapi import HTTPException
+from pydantic import BaseModel, Field, field_validator
+from aatm.data_models import SourceConcept
+class TerminologyMappingRequest(BaseModel):
+    """Request model for terminology mapping operations.
+    This model encapsulates the list of source concepts to be mapped together with
+    the optional identifiers of the pipeline components used during the mapping
+    workflow.
+    Attributes:
+        source_concepts: Source concepts to map to a target terminology.
+        translator_id: Optional identifier of the translator component.
+        retriever_id: Optional identifier of the retriever component.
+        selector_id: Optional identifier of the selector component.
+        reranker_id: Optional identifier of the reranker component.
+    """
+    source_concepts: List[SourceConcept]
+    translator_id: Optional[str] = Field(None, examples=["gemini-2.5-flash"])
+    retriever_id: Optional[str] = Field(None, examples=["embeddinggemma-300M"])
+    selector_id: Optional[str] = Field(None, examples=["first-result-selector"])
+    reranker_id: Optional[str] = Field(None, examples=[None])
+    @field_validator("source_concepts", mode="after")
+    def validate_source_concepts(cls, v: List[SourceConcept]):
+        """Validate the list of source concepts provided in the request.
+        This validator ensures that at least one source concept is provided and that
+        all source concepts include a `source_code_description` value.
+        Args:
+            v: List of source concepts to validate.
+        Returns:
+            The validated list of source concepts.
+        Raises:
+            HTTPException: If no source concepts are provided.
+            HTTPException: If one or more source concepts are missing the
+                `source_code_description` field.
+        """
+        if len(v) == 0:
+            raise HTTPException(status_code=400, detail="No source concepts provided")
+        incomplete_source_concepts = [
+            source_concept
+            for source_concept in v
+            if source_concept.source_code_description is None
+        ]
+        if len(incomplete_source_concepts) > 0:
+            raise HTTPException(
+                status_code=400,
+                detail=f"The field source_code_description is required for all source concepts. A total of {len(incomplete_source_concepts)} source concepts are missing this field.",
+            )
+        return v
+class SearchRequest(BaseModel):
+    """Request model for retriever-based search operations.
+    Attributes:
+        queries: List of query strings to search for.
+        retriever_id: Identifier of the retriever to use.
+        top_k: Maximum number of results to return per query.
+        where: Optional metadata filter applied during retrieval.
+    """
+    # Required fields: declared with `...`, so they have no default.
+    queries: List[str] = Field(..., examples=[["Cardiovascular disease"]])
+    retriever_id: str = Field(..., examples=["embeddinggemma-300M"])
+    # Optional fields: top_k defaults to 10 and where defaults to None.
+    top_k: int = Field(10, examples=[10])
+    where: dict[str, Any] | None = Field(None, examples=[None])

aatm/api/main.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""FastAPI application for terminology mapping and retrieval workflows.
+This module defines API endpoints for terminology mapping and retrieval, along
+with an in-memory least-recently-used registry for caching pipeline components.
+The registry avoids repeatedly instantiating expensive retrievers and
+terminology mappers across requests.
+"""
+from collections import OrderedDict
+from typing import List
+from fastapi import FastAPI
+from aatm.api.config import APIConfig
+from aatm.api.data_models import SearchRequest, TerminologyMappingRequest
+from aatm.data_models import MappedSourceConcept, RetrieverResults
+from aatm.pipeline import PipelineBaseClass
+from aatm.registries.retrievers import load_retriever
+from aatm.retrievers import ChromaDBRetriever
+from aatm.terminology_mapper import TerminologyMapper
+app = FastAPI()
+# Read the API configuration when this module is imported; no path is passed, so
+# `load_from_disk` uses its default path.
+api_config = APIConfig.load_from_disk()
+class ComponentRegistry:
+    """Least-recently-used in-memory registry for pipeline components.
+    This registry stores instantiated terminology mappers and other pipeline
+    components keyed by configuration tuples. When the registry reaches its maximum
+    capacity, the least recently used item is evicted.
+    Args:
+        max_size: Maximum number of cached components to retain.
+    Attributes:
+        max_size: Maximum number of cached components.
+        _store: Ordered mapping from cache keys to instantiated pipeline
+            components.
+    """
+    def __init__(self, max_size: int = 10):
+        """Initialize the component registry.
+        Args:
+            max_size: Maximum number of cached components to retain before evicting
+                the least recently used entry.
+        """
+        self.max_size = max_size
+        self._store: OrderedDict[tuple, TerminologyMapper | PipelineBaseClass] = (
+            OrderedDict()
+        )
+    def get(self, key: tuple) -> TerminologyMapper | PipelineBaseClass | None:
+        """Retrieve a cached component by key.
+        If the key is present, the corresponding component is marked as recently used
+        before being returned.
+        Args:
+            key: Cache key identifying the component.
+        Returns:
+            The cached `TerminologyMapper` or `PipelineBaseClass` instance associated
+            with the key, or `None` if the key is not present.
+        """
+        if key not in self._store:
+            return None
+        # Mark as recently used
+        self._store.move_to_end(key)
+        return self._store[key]
+    def set(self, key: tuple, value: TerminologyMapper | PipelineBaseClass) -> None:
+        """Store a component in the registry.
+        If the key already exists, the existing entry is updated and marked as recently
+        used. If the registry is full, the least recently used entry is removed before
+        adding the new component.
+        Args:
+            key: Cache key identifying the component.
+            value: Instantiated terminology mapper or pipeline component to cache.
+        Returns:
+            None.
+        """
+        if key in self._store:
+            self._store.move_to_end(key)
+            self._store[key] = value
+            return
+        if len(self._store) >= self.max_size:
+            # Remove least recently used item
+            self._store.popitem(last=False)
+        self._store[key] = value
+# Registry shared by the endpoints below; it holds at most 20 components.
+PIPELINE_COMPONENTS_REGISTRY = ComponentRegistry(max_size=20)
+@app.post("/map", response_model=List[MappedSourceConcept])
+def map(request: TerminologyMappingRequest):
+    """Map source concepts to target terminology concepts.
+    This endpoint retrieves or creates a `TerminologyMapper` instance based on the
+    requested pipeline component identifiers, then runs the mapping workflow over
+    the provided source concepts.
+    Args:
+        request: Terminology mapping request containing source concepts and the
+            identifiers of the translator, retriever, selector, and reranker
+            components.
+    Returns:
+        A list of mapped source concepts produced by the terminology mapping
+            pipeline.
+    """
+    tm_key = (
+        request.translator_id,
+        request.retriever_id,
+        request.selector_id,
+        request.reranker_id,
+    )
+    tm = PIPELINE_COMPONENTS_REGISTRY.get(tm_key)
+    if tm is None:
+        tm = TerminologyMapper.from_task_request(request, api_config)
+        PIPELINE_COMPONENTS_REGISTRY.set(tm_key, tm)
+    mapped_concepts = tm.map(
+        request.source_concepts,
+        save_to_disk=False,
+        return_as="mapped_source_concepts",
+    )
+    return mapped_concepts
+@app.post("/search", response_model=RetrieverResults)
+def search(request: SearchRequest) -> RetrieverResults:
+    """Search terminology candidates using a configured retriever.
+    This endpoint retrieves or creates a retriever instance based on the requested
+    retriever identifier, then executes the retrieval operation with the request
+    parameters.
+    Args:
+        request: Search request containing the retriever identifier and query
+            parameters.
+    Returns:
+        A `RetrieverResults` object containing the retrieval results.
+    """
+    retriever_key = f"retriever-{request.retriever_id}"
+    retriever = PIPELINE_COMPONENTS_REGISTRY.get(retriever_key)
+    if retriever is None:
+        retriever: ChromaDBRetriever = load_retriever(request.retriever_id)
+        PIPELINE_COMPONENTS_REGISTRY.set(retriever_key, retriever)
+    return retriever(**request.model_dump())