PyPI - datashare-python - Versions diffs - 0.1.0__py3-none-any.whl - Mend

datashare-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

datashare_python/__init__.py +0 -0
datashare_python/__main__.py +4 -0
datashare_python/app.py +85 -0
datashare_python/cli/__init__.py +30 -0
datashare_python/cli/tasks.py +182 -0
datashare_python/cli/utils.py +33 -0
datashare_python/config.py +60 -0
datashare_python/constants.py +6 -0
datashare_python/objects.py +49 -0
datashare_python/task_client.py +124 -0
datashare_python/tasks/__init__.py +2 -0
datashare_python/tasks/classify_docs.py +227 -0
datashare_python/tasks/dependencies.py +110 -0
datashare_python/tasks/translate_docs.py +223 -0
datashare_python/utils.py +69 -0
datashare_python-0.1.0.dist-info/METADATA +80 -0
datashare_python-0.1.0.dist-info/RECORD +19 -0
datashare_python-0.1.0.dist-info/WHEEL +4 -0
datashare_python-0.1.0.dist-info/entry_points.txt +4 -0

datashare_python/__init__.py ADDED Viewed

File without changes

datashare_python/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from datashare_python.cli import cli_app
+if __name__ == "__main__":
+    cli_app()

datashare_python/app.py ADDED Viewed

@@ -0,0 +1,85 @@
+from typing import Optional
+from icij_worker import AsyncApp
+from icij_worker.typing_ import PercentProgress
+from pydantic import parse_obj_as
+from datashare_python.constants import PYTHON_TASK_GROUP
+from datashare_python.objects import ClassificationConfig, TranslationConfig
+from datashare_python.tasks import (
+    classify_docs as classify_docs_,
+    create_classification_tasks as create_classification_tasks_,
+    create_translation_tasks as create_translation_tasks_,
+    translate_docs as translate_docs_,
+)
+from datashare_python.tasks.dependencies import APP_LIFESPAN_DEPS
+app = AsyncApp("ml", dependencies=APP_LIFESPAN_DEPS)
+@app.task(group=PYTHON_TASK_GROUP)
+async def create_translation_tasks(
+    project: str,
+    target_language: str,
+    config: dict | None = None,
+    user: dict | None = None,  # pylint: disable=unused-argument
+) -> list[str]:
+    """Task wrapper around ``datashare_python.tasks.create_translation_tasks``.
+    The raw ``config`` dict (or ``None``) is parsed into an optional
+    ``TranslationConfig`` before being forwarded; ``user`` is accepted but unused.
+    """
+    translation_config = parse_obj_as(Optional[TranslationConfig], config)
+    task_ids = await create_translation_tasks_(
+        project=project,
+        target_language=target_language,
+        config=translation_config,
+    )
+    return task_ids
+@app.task(group=PYTHON_TASK_GROUP)
+async def translate_docs(
+    docs: list[str],
+    project: str,
+    target_language: str,
+    progress: PercentProgress,
+    config: dict | None = None,
+    user: dict | None = None,  # pylint: disable=unused-argument
+) -> int:
+    """Task wrapper around ``datashare_python.tasks.translate_docs``.
+    The raw ``config`` dict (or ``None``) is parsed into an optional
+    ``TranslationConfig`` before being forwarded; ``user`` is accepted but unused.
+    """
+    translation_config = parse_obj_as(Optional[TranslationConfig], config)
+    outcome = await translate_docs_(
+        docs,
+        target_language,
+        project=project,
+        config=translation_config,
+        progress=progress,
+    )
+    return outcome
+@app.task(group=PYTHON_TASK_GROUP)
+async def create_classification_tasks(
+    project: str,
+    language: str,
+    n_workers: int,
+    progress: PercentProgress,
+    config: dict | None = None,
+    user: dict | None = None,  # pylint: disable=unused-argument
+) -> list[str]:
+    """Task wrapper around ``datashare_python.tasks.create_classification_tasks``.
+    The raw ``config`` dict (or ``None``) is parsed into an optional
+    ``ClassificationConfig`` before being forwarded; ``user`` is accepted but unused.
+    """
+    classification_config = parse_obj_as(Optional[ClassificationConfig], config)
+    forwarded = {
+        "project": project,
+        "language": language,
+        "n_workers": n_workers,
+        "config": classification_config,
+        "progress": progress,
+    }
+    return await create_classification_tasks_(**forwarded)
+@app.task(group=PYTHON_TASK_GROUP)
+async def classify_docs(
+    docs: list[str],
+    language: str,
+    project: str,
+    progress: PercentProgress,
+    config: dict | None = None,
+    user: dict | None = None,  # pylint: disable=unused-argument
+) -> int:
+    """Task wrapper around ``datashare_python.tasks.classify_docs``.
+    The raw ``config`` dict (or ``None``) is parsed into an optional
+    ``ClassificationConfig`` before being forwarded; ``user`` is accepted but unused.
+    """
+    classification_config = parse_obj_as(Optional[ClassificationConfig], config)
+    outcome = await classify_docs_(
+        docs,
+        language=language,
+        project=project,
+        config=classification_config,
+        progress=progress,
+    )
+    return outcome
+@app.task(group=PYTHON_TASK_GROUP)
+def ping() -> str:
+    """Trivial task which takes no input and always returns the string "pong"."""
+    answer = "pong"
+    return answer

datashare_python/cli/__init__.py ADDED Viewed

@@ -0,0 +1,30 @@
+import importlib.metadata
+from typing import Annotated, Optional
+import typer
+import datashare_python
+from datashare_python.cli.tasks import task_app
+from datashare_python.cli.utils import AsyncTyper
+cli_app = AsyncTyper(context_settings={"help_option_names": ["-h", "--help"]})
+cli_app.add_typer(task_app)
+def version_callback(value: bool):
+    """Print the installed package version and exit the CLI when ``value`` is truthy.
+    Does nothing (returns ``None``) otherwise.
+    """
+    if not value:
+        return
+    print(importlib.metadata.version(datashare_python.__name__))
+    raise typer.Exit()
+@cli_app.callback(name="datashare-python")
+def main(
+    # `--version` is declared as an eager option whose callback is version_callback,
+    # which prints the package version and raises typer.Exit when the flag is set.
+    version: Annotated[  # pylint: disable=unused-argument
+        Optional[bool],
+        typer.Option(  # pylint: disable=unused-argument
+            "--version", callback=version_callback, is_eager=True
+        ),
+    ] = None
+):
+    # NOTE(review): the docstring below is presumably used by Typer as the CLI help
+    # text, so it is left unchanged; confirm before rewording it.
+    """Datashare Python CLI"""

datashare_python/cli/tasks.py ADDED Viewed

@@ -0,0 +1,182 @@
+import asyncio
+import json
+import logging
+import sys
+from pathlib import Path
+from traceback import FrameSummary, StackSummary
+from typing import Annotated, Any, Optional
+import typer
+from alive_progress import alive_bar
+from icij_worker import TaskState
+from icij_worker.objects import READY_STATES, Task, TaskError
+from datashare_python.cli.utils import AsyncTyper, eprint
+from datashare_python.constants import PYTHON_TASK_GROUP
+from datashare_python.task_client import DatashareTaskClient
+logger = logging.getLogger(__name__)
+DEFAULT_DS_ADDRESS = "http://localhost:8080"
+_ARGS_HELP = "task argument as a JSON string or file path"
+_GROUP_HELP = "task group"
+_DS_API_KEY_HELP = "datashare API key"
+_DS_URL_HELP = "datashare address"
+_POLLING_INTERVAL_S_HELP = "task state polling interval in seconds"
+_NAME_HELP = "registered task name"
+_RESULT_HELP = "get a task result"
+_START_HELP = "creates a new task and start it"
+_TASK_ID_HELP = "task ID"
+_WATCH_HELP = "watch a task until it's complete"
+TaskArgs = str
+task_app = AsyncTyper(name="task")
+@task_app.async_command(help=_START_HELP)
+async def start(
+    name: Annotated[str, typer.Argument(help=_NAME_HELP)],
+    args: Annotated[TaskArgs, typer.Argument(help=_ARGS_HELP)] = None,
+    group: Annotated[
+        Optional[str],
+        typer.Option("--group", "-g", help=_GROUP_HELP),
+    ] = PYTHON_TASK_GROUP.name,
+    ds_address: Annotated[
+        str, typer.Option("--ds-address", "-a", help=_DS_URL_HELP)
+    ] = DEFAULT_DS_ADDRESS,
+    ds_api_key: Annotated[
+        Optional[str], typer.Option("--ds-api-key", "-k", help=_DS_API_KEY_HELP)
+    ] = None,
+):
+    # Creates the task `name` through the Datashare task client, then prints the
+    # task ID on stdout (human-readable messages go to stderr through eprint).
+    # `args` is either a path to a JSON file, an inline JSON string, or omitted
+    # (in which case an empty dict of arguments is sent).
+    # Raises TypeError when `args` is neither a string nor None.
+    match args:
+        case str():
+            # The file path lookup must be done on `args`, not on the task name
+            as_path = Path(args)
+            try:
+                is_existing_path = as_path.exists()
+            except OSError:
+                # A long inline JSON string is not a valid file name on some
+                # platforms (e.g. name too long): treat it as inline JSON
+                is_existing_path = False
+            if is_existing_path:
+                args = json.loads(as_path.read_text())
+            else:
+                args = json.loads(args)
+        case None:
+            args = dict()
+        case _:
+            raise TypeError(f"Invalid args {args}")
+    client = DatashareTaskClient(ds_address, api_key=ds_api_key)
+    async with client:
+        task_id = await client.create_task(name, args, group=group)
+    eprint(f"Task({task_id}) started !")
+    eprint(f"Task({task_id}) 🛫")
+    print(task_id)
+@task_app.async_command(help=_WATCH_HELP)
+async def watch(
+    task_id: Annotated[str, typer.Argument(help=_TASK_ID_HELP)],
+    ds_address: Annotated[
+        str, typer.Option("--ds-address", "-a", help=_DS_URL_HELP)
+    ] = DEFAULT_DS_ADDRESS,
+    ds_api_key: Annotated[
+        Optional[str], typer.Option("--ds-api-key", "-k", help=_DS_API_KEY_HELP)
+    ] = None,
+    polling_interval_s: Annotated[
+        float, typer.Option("--polling-interval-s", "-p", help=_POLLING_INTERVAL_S_HELP)
+    ] = 1.0,
+):
+    # Fetches the task once: if it already reached a ready state, its final state
+    # is reported right away, otherwise the task is polled every
+    # `polling_interval_s` seconds until it is ready. The task ID is finally
+    # printed on stdout (unless a handler exits the CLI first).
+    client = DatashareTaskClient(ds_address, api_key=ds_api_key)
+    async with client:
+        task = await client.get_task(task_id)
+        # Membership test: `is` would compare the state with the collection itself
+        # and never match
+        if task.state in READY_STATES:
+            await _handle_ready(task, client, already_done=True)
+        else:
+            # Only poll tasks which are still alive, otherwise the final state
+            # would be reported a second time by the polling handler
+            await _handle_alive(task, client, polling_interval_s)
+    print(task_id)
+@task_app.async_command(help=_RESULT_HELP)
+async def result(
+    task_id: Annotated[str, typer.Argument(help=_TASK_ID_HELP)],
+    ds_address: Annotated[
+        str, typer.Option("--ds-address", "-a", help=_DS_URL_HELP)
+    ] = DEFAULT_DS_ADDRESS,
+    ds_api_key: Annotated[
+        Optional[str], typer.Option("--ds-api-key", "-k", help=_DS_API_KEY_HELP)
+    ] = None,
+) -> Any:
+    # Fetches the task result and prints it on stdout; dict and list results are
+    # pretty-printed as JSON, anything else is printed as is.
+    task_client = DatashareTaskClient(ds_address, api_key=ds_api_key)
+    async with task_client:
+        task_result = await task_client.get_task_result(task_id)
+        output = (
+            json.dumps(task_result, indent=2)
+            if isinstance(task_result, (dict, list))
+            else task_result
+        )
+        print(output)
+async def _handle_ready(
+    task: Task, client: DatashareTaskClient, already_done: bool = False
+) -> None:
+    """Dispatch a task to the handler matching its state.
+    ERROR and CANCELLED have their own handlers; DONE uses the "already done"
+    handler when ``already_done`` is set, the regular "done" handler otherwise.
+    Raises ``ValueError`` for any other state.
+    """
+    state = task.state
+    if state == TaskState.ERROR:
+        await _handle_error(task, client)
+    elif state == TaskState.CANCELLED:
+        await _handle_cancelled(task)
+    elif state == TaskState.DONE:
+        done_handler = _handle_already_done if already_done else _handle_done
+        await done_handler(task)
+    else:
+        raise ValueError(f"Unexpected task state {task.state}")
+async def _handle_error(task, client: DatashareTaskClient):
+    """Fetch the task error, print it on stderr and exit the CLI with code 1."""
+    task_error = await client.get_task_error(task.id)
+    formatted = _format_error(task_error)
+    eprint(f"Task({task.id}) failed with the following error:\n\n{formatted}")
+    eprint(f"Task({task.id}) ❌")
+    raise typer.Exit(code=1)
+async def _handle_cancelled(task):
+    """Report on stderr that the task was cancelled and exit the CLI with code 1."""
+    label = f"Task({task.id})"
+    eprint(f"{label} was cancelled !")
+    eprint(f"{label} 🛑")
+    raise typer.Exit(code=1)
+async def _handle_already_done(task):
+    """Report on stderr that the task had already completed."""
+    message = f"Task({task.id}) ✅ is already completed !"
+    eprint(message)
+async def _handle_done(task):
+    """Report on stderr that the task has just completed."""
+    label = f"Task({task.id})"
+    for marker in ("🛬", "✅"):
+        eprint(f"{label} {marker}")
+async def _handle_alive(
+    task: Task, client: DatashareTaskClient, polling_interval_s: float
+) -> None:
+    """Poll the task until its state is in READY_STATES, then handle the final state.
+    A progress bar written to stderr is updated with the task progress (0.0 when
+    the task reports none) after each poll, and ``polling_interval_s`` seconds are
+    slept between polls.
+    """
+    bar_title = f"Task({task.id}) 🛫"
+    bar_context = alive_bar(
+        title=bar_title,
+        manual=True,
+        stats="(ETA: {eta})",
+        monitor="{percent}",
+        file=sys.stderr,
+    )
+    with bar_context as update_bar:
+        current_state = task.state
+        while current_state not in READY_STATES:
+            task = await client.get_task(task.id)
+            current_state = task.state
+            update_bar(task.progress or 0.0)  # pylint: disable=not-callable
+            await asyncio.sleep(polling_interval_s)
+    if current_state in READY_STATES:
+        await _handle_ready(task, client)
+def _format_error(error: TaskError) -> str:
+    """Render a task error as text: error name, stack trace, message and cause.
+    The cause line is only appended when ``error.cause`` is truthy.
+    """
+    # NOTE(review): FrameSummary's first argument is the file name, yet the frame
+    # name is passed for both the file and the function name; confirm whether the
+    # stacktrace items expose a dedicated file attribute.
+    stack = StackSummary.from_list(
+        [FrameSummary(f.name, f.lineno, f.name) for f in error.stacktrace]
+    )
+    # StackSummary is a list: interpolating it directly would print the list repr
+    # rather than a readable stack trace
+    formatted_stack = "".join(stack.format())
+    msg = f"{error.name}:\n{formatted_stack}\n{error.message}"
+    if error.cause:
+        # f-string needed here, otherwise the placeholder is printed verbatim
+        msg += f"\n cause by {error.cause}"
+    return msg

datashare_python/cli/utils.py ADDED Viewed

@@ -0,0 +1,33 @@
+import asyncio
+import concurrent.futures
+import sys
+from functools import wraps
+import typer
+class AsyncTyper(typer.Typer):
+    """Typer application which can also register coroutine functions as commands."""
+    def async_command(self, *args, **kwargs):
+        """Decorator factory registering a coroutine function as a command.
+        ``args`` and ``kwargs`` are forwarded to ``Typer.command``. The registered
+        command is a synchronous wrapper running the coroutine with ``asyncio.run``;
+        the decorated coroutine function itself is returned unchanged.
+        """
+        def decorator(async_func):
+            @wraps(async_func)
+            def sync_func(*_args, **_kwargs):
+                return asyncio.run(async_func(*_args, **_kwargs))
+            register = self.command(*args, **kwargs)
+            register(sync_func)
+            return async_func
+        return decorator
+def eprint(*args, **kwargs):
+    """Same as ``print`` but writing to ``sys.stderr``."""
+    stream = sys.stderr
+    print(*args, file=stream, **kwargs)
+def _to_concurrent(
+    fut: asyncio.Future, loop: asyncio.AbstractEventLoop
+) -> concurrent.futures.Future:
+    """Schedule on ``loop`` a coroutine awaiting ``fut``.
+    Returns the ``concurrent.futures.Future`` tracking that coroutine; note that
+    the awaited value is discarded, so the returned future resolves to ``None``.
+    """
+    async def _await_future():
+        await fut
+    waiter = _await_future()
+    return asyncio.run_coroutine_threadsafe(waiter, loop)

datashare_python/config.py ADDED Viewed

@@ -0,0 +1,60 @@
+from typing import ClassVar
+from icij_common.pydantic_utils import ICIJSettings, NoEnumModel
+from icij_worker.utils.logging_ import LogWithWorkerIDMixin
+from pydantic import Field
+import datashare_python
+_ALL_LOGGERS = [datashare_python.__name__]
+class AppConfig(ICIJSettings, LogWithWorkerIDMixin, NoEnumModel):
+    # Application settings, from which the Elasticsearch and Datashare task
+    # clients are built (see to_es_client and to_task_client).
+    class Config:
+        env_prefix = "DS_DOCKER_ML_"
+    loggers: ClassVar[list[str]] = Field(_ALL_LOGGERS, const=True)
+    log_level: str = Field(default="INFO")
+    batch_size: int = 1024
+    pipeline_batch_size: int = 1024
+    ne_buffer_size: int = 1000
+    # DS
+    ds_api_key: str | None = None
+    ds_url: str = "http://datashare:8080"
+    # ES
+    es_address: str = "http://localhost:9200"
+    es_default_page_size: int = 1000
+    es_keep_alive: str = "10m"
+    es_max_concurrency: int = 5
+    es_max_retries: int = 0
+    es_max_retry_wait_s: int | float = 60
+    es_timeout_s: int | float = 60 * 5
+    def to_es_client(self, address: str | None = None) -> "ESClient":
+        """Build an ES client from the ``es_*`` settings and ``ds_api_key``.
+        ``address`` overrides ``es_address`` when provided.
+        """
+        # Imported lazily, inside the method
+        from icij_common.es import ESClient
+        if address is None:
+            address = self.es_address
+        client = ESClient(
+            hosts=[address],
+            pagination=self.es_default_page_size,
+            max_concurrency=self.es_max_concurrency,
+            keep_alive=self.es_keep_alive,
+            timeout=self.es_timeout_s,
+            max_retries=self.es_max_retries,
+            max_retry_wait_s=self.es_max_retry_wait_s,
+            api_key=self.ds_api_key,
+        )
+        # NOTE(review): presumably skips the client's Elasticsearch product
+        # verification; confirm against the elasticsearch client version in use
+        client.transport._verified_elasticsearch = (  # pylint: disable=protected-access
+            True
+        )
+        return client
+    def to_task_client(self) -> "DatashareTaskClient":
+        """Build a Datashare task client from ``ds_url`` and ``ds_api_key``."""
+        # Imported lazily, inside the method
+        from datashare_python.task_client import DatashareTaskClient
+        # Forward the configured API key, otherwise it is silently ignored and
+        # the client always falls back to unauthenticated access
+        return DatashareTaskClient(self.ds_url, api_key=self.ds_api_key)

datashare_python/constants.py ADDED Viewed

@@ -0,0 +1,6 @@
+from pathlib import Path
+from icij_worker.app import TaskGroup
+DATA_DIR = Path(__file__).parent.joinpath(".data")
+PYTHON_TASK_GROUP = TaskGroup(name="PYTHON")

datashare_python/objects.py ADDED Viewed

@@ -0,0 +1,49 @@
+from typing import Self
+import pycountry
+from icij_common.es import DOC_CONTENT, DOC_LANGUAGE, DOC_ROOT_ID, ID_, SOURCE
+from icij_common.pydantic_utils import ICIJModel, LowerCamelCaseModel
+from pydantic import Field
+class Document(LowerCamelCaseModel):
+    # Subset of a document's fields, built from an Elasticsearch hit by from_es
+    id: str
+    root_document: str
+    content: str
+    language: str
+    tags: list[str] = Field(default_factory=list)
+    content_translated: dict[str, str] = Field(
+        default_factory=dict, alias="content_translated"
+    )
+    @classmethod
+    def from_es(cls, es_doc: dict) -> Self:
+        """Build a document from an ES hit: the ID is read from the hit itself,
+        all other fields from its source; missing tags and translations default
+        to empty containers.
+        """
+        source = es_doc[SOURCE]
+        fields = {
+            "id": es_doc[ID_],
+            "content": source[DOC_CONTENT],
+            "content_translated": source.get("content_translated", dict()),
+            "language": source[DOC_LANGUAGE],
+            "root_document": source[DOC_ROOT_ID],
+            "tags": source.get("tags", []),
+        }
+        return cls(**fields)
+class ClassificationConfig(ICIJModel):
+    # Settings of the classification pipeline.
+    # Declared with const=True, defaulting to "text-classification"
+    task: str = Field(const=True, default="text-classification")
+    # Default model identifier
+    # NOTE(review): presumably a Hugging Face hub model name; confirm
+    model: str = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
+    # Default batch size
+    batch_size: int = 16
+class TranslationConfig(ICIJModel):
+    # Settings of the translation pipeline; `model` is a prefix which
+    # to_pipeline_args completes with the source and target language codes
+    task: str = Field(const=True, default="translation")
+    model: str = "Helsinki-NLP/opus-mt"
+    batch_size: int = 16
+    def to_pipeline_args(self, source_language: str, *, target_language: str) -> dict:
+        """Return the config as a dict whose ``task`` and ``model`` entries are
+        specialized with the alpha-2 codes of the source and target languages,
+        both looked up by name with pycountry.
+        """
+        source_alpha2, target_alpha2 = (
+            pycountry.languages.get(name=language_name).alpha_2
+            for language_name in (source_language, target_language)
+        )
+        pipeline_args = self.dict()
+        pipeline_args.update(
+            task=f"translation_{source_alpha2}_to_{target_alpha2}",
+            model=f"{self.model}-{source_alpha2}-{target_alpha2}",
+        )
+        return pipeline_args

datashare_python/task_client.py ADDED Viewed

@@ -0,0 +1,124 @@
+import uuid
+from typing import Any, Dict, Optional
+from icij_common.pydantic_utils import jsonable_encoder
+from icij_worker import Task, TaskError, TaskState
+from icij_worker.exceptions import UnknownTask
+from icij_worker.utils.http import AiohttpClient
+# TODO: maxRetries is not supported by java, it's automatically set to 3
+_TASK_UNSUPPORTED = {"max_retries"}
+class DatashareTaskClient(AiohttpClient):
+    """HTTP client for the Datashare task API (``/api/task/...`` endpoints)."""
+    def __init__(self, datashare_url: str, api_key: str | None = None) -> None:
+        """When ``api_key`` is provided, it is sent as a bearer token in the
+        ``Authorization`` header of the requests."""
+        headers = None
+        if api_key is not None:
+            headers = {"Authorization": f"Bearer {api_key}"}
+        super().__init__(datashare_url, headers=headers)
+    async def __aenter__(self):
+        """Enter the parent client context and, when no ``Authorization`` header
+        is set, store the session cookie returned by ``/settings``.
+        Returns the client so that ``async with client as c`` binds the client.
+        Raises ``ValueError`` unless exactly one session cookie item is found.
+        """
+        await super().__aenter__()
+        if "Authorization" not in self._headers:
+            async with self._get("/settings") as res:
+                # SimpleCookie doesn't seem to parse DS cookie so we perform some dirty
+                # hack here
+                session_id = [
+                    item
+                    for item in res.headers["Set-Cookie"].split("; ")
+                    if "session_id" in item
+                ]
+                if len(session_id) != 1:
+                    raise ValueError("Invalid cookie")
+                # Split on the first "=" only, the cookie value may contain some
+                k, v = session_id[0].split("=", 1)
+                self._session.cookie_jar.update_cookies({k: v})
+        return self
+    async def create_task(
+        self,
+        name: str,
+        args: Dict[str, Any],
+        *,
+        id_: Optional[str] = None,
+        group: Optional[str] = None,
+    ) -> str:
+        """Create the task ``name`` with ``args`` and return its ID as sent back
+        by the server. An ID is generated from the task name when ``id_`` is not
+        provided. Raises ``TypeError`` when ``group`` is not a string.
+        """
+        if id_ is None:
+            id_ = _generate_task_id(name)
+        task = Task.create(task_id=id_, task_name=name, args=args)
+        task = jsonable_encoder(task, exclude=_TASK_UNSUPPORTED, exclude_unset=True)
+        task.pop("createdAt")
+        url = f"/api/task/{id_}"
+        if group is not None:
+            if not isinstance(group, str):
+                raise TypeError(f"expected group to be a string found {group}")
+            url += f"?group={group}"
+        async with self._put(url, json=task) as res:
+            task_res = await res.json()
+        return task_res["taskId"]
+    async def get_task(self, id_: str) -> Task:
+        """Fetch a task, raising ``UnknownTask`` when the server returns null."""
+        url = f"/api/task/{id_}"
+        async with self._get(url) as res:
+            task = await res.json()
+        if task is None:
+            raise UnknownTask(id_)
+        # TODO: align Java on Python here... it's not a good idea to store results
+        #  inside tasks since result can be quite large and we may want to get the task
+        #  metadata without having to deal with the large task results...
+        task = _ds_to_icij_worker_task(task)
+        task = Task(**task)
+        return task
+    async def get_tasks(self) -> list[Task]:
+        """Fetch all tasks returned by ``/api/task/all``."""
+        url = "/api/task/all"
+        async with self._get(url) as res:
+            tasks = await res.json()
+        # TODO: align Java on Python here... it's not a good idea to store results
+        #  inside tasks since result can be quite large and we may want to get the task
+        #  metadata without having to deal with the large task results...
+        tasks = (_ds_to_icij_worker_task(t) for t in tasks)
+        tasks = [Task(**task) for task in tasks]
+        return tasks
+    async def get_task_state(self, id_: str) -> TaskState:
+        """Return the state of the task, fetched with ``get_task``."""
+        return (await self.get_task(id_)).state
+    async def get_task_result(self, id_: str) -> Any:
+        """Return the JSON body of the task ``results`` endpoint."""
+        url = f"/api/task/{id_}/results"
+        async with self._get(url) as res:
+            task_res = await res.json()
+        return task_res
+    async def get_task_error(self, id_: str) -> TaskError:
+        """Return the error stored on a task in ERROR state.
+        Raises ``UnknownTask`` when the server returns null and ``ValueError``
+        when the task is in any other state.
+        """
+        url = f"/api/task/{id_}"
+        async with self._get(url) as res:
+            task = await res.json()
+        if task is None:
+            raise UnknownTask(id_)
+        task_state = TaskState[task["state"]]
+        if task_state != TaskState.ERROR:
+            msg = f"can't find error for task {id_} in state {task_state}"
+            raise ValueError(msg)
+        error = TaskError(**task["error"])
+        return error
+    async def delete(self, id_: str):
+        """Delete the task ``id_``; the response body is ignored."""
+        url = f"/api/task/{id_}"
+        async with self._delete(url):
+            pass
+    async def delete_all_tasks(self):
+        """Delete, one by one, every task returned by ``get_tasks``."""
+        for t in await self.get_tasks():
+            await self.delete(t.id)
+def _generate_task_id(task_name: str) -> str:
+    """Return ``<task_name>-<random UUID4>``."""
+    unique_suffix = uuid.uuid4()
+    return "-".join((task_name, str(unique_suffix)))
+_JAVA_TASK_ATTRIBUTES = ["result", "error"]
+def _ds_to_icij_worker_task(task: dict) -> dict:
+    """Remove, in place, the attributes listed in ``_JAVA_TASK_ATTRIBUTES`` from
+    the task dict (when present) and return that same dict."""
+    present = [attr for attr in _JAVA_TASK_ATTRIBUTES if attr in task]
+    for attr in present:
+        del task[attr]
+    return task

datashare_python/tasks/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .classify_docs import classify_docs, create_classification_tasks
2	+ from .translate_docs import translate_docs, create_translation_tasks

datashare_python/tasks/classify_docs.py ADDED Viewed

@@ -0,0 +1,227 @@
+import logging
+from typing import AsyncGenerator, Generator, Iterable, Optional
+import torch
+from elasticsearch._async.helpers import async_bulk
+from icij_common.es import (
+    BOOL,
+    DOC_CONTENT,
+    DOC_CONTENT_TRANSLATED,
+    DOC_LANGUAGE,
+    DOC_ROOT_ID,
+    ESClient,
+    HITS,
+    ID_,
+    MUST_NOT,
+    QUERY,
+    SHOULD,
+    TERM,
+    UPDATE,
+    and_query,
+    bulk_action,
+    has_id,
+)
+from icij_worker.ds_task_client import DatashareTaskClient
+from icij_worker.typing_ import PercentProgress
+from icij_worker.utils.progress import to_raw_progress, to_scaled_progress
+from transformers import Pipeline, pipeline
+from datashare_python.constants import PYTHON_TASK_GROUP
+from datashare_python.objects import ClassificationConfig, Document
+from datashare_python.tasks.dependencies import lifespan_es_client, lifespan_task_client
+from datashare_python.utils import batches
+logger = logging.getLogger(__name__)
+async def create_classification_tasks(
+    *,
+    project: str,
+    language: str,
+    n_workers: int,
+    config: ClassificationConfig | None,
+    es_client: ESClient | None = None,
+    task_client: DatashareTaskClient | None = None,
+    progress: PercentProgress | None = None,
+) -> list[str]:
+    """Create "classify_docs" tasks for the unclassified documents of a project.
+    The IDs of the documents returned by ``_get_unclassified`` are collected, split
+    into batches, and one task is created per batch in the Python task group.
+    ``es_client`` and ``task_client`` default to the lifespan clients, ``config`` to
+    a default ``ClassificationConfig``.
+    Returns the IDs of the created tasks (empty when nothing is left to classify).
+    Raises ``ValueError`` when ``n_workers`` is lower than 1.
+    """
+    if n_workers < 1:
+        raise ValueError("n_workers must be at least 1")
+    if es_client is None:
+        es_client = lifespan_es_client()
+    if task_client is None:
+        task_client = lifespan_task_client()
+    task_ids = []
+    if config is None:
+        config = ClassificationConfig()
+    # Retrieve unprocessed docs.
+    model = config.model
+    unclassified = _get_unclassified(
+        es_client, project=project, language=language, model=model
+    )
+    # All document IDs are materialized in memory before creating any task
+    unclassified = [d[ID_] async for d in unclassified]
+    n_docs = len(unclassified)
+    if not n_docs:
+        logger.info("found not unclassified documents !")
+        return task_ids
+    logger.info("found %s unclassified documents !", n_docs)
+    # Fetching the documents is reported as the first half of the progress
+    fetch_unclassified_progress = 0.5
+    if progress is not None:
+        await progress(fetch_unclassified_progress)
+    # Roughly split the load between workers:
+    # - they should approximately receive the same amount of work
+    # - they should receive tasks which are long enough to avoid model loading overhead
+    # - task should be short enough to avoid starting all over again from scratch in
+    # case of failure
+    # NOTE(review): since n_workers >= 1, n_docs // n_workers is never smaller than
+    # n_docs // (n_workers * 5), so the second term never wins the max() and the
+    # resulting batch size is close to n_workers; confirm this matches the intent
+    # described above.
+    n_tasks = max(n_docs // n_workers, n_docs // (n_workers * 5), 1)
+    task_batch_size = n_docs // n_tasks
+    if progress is not None:
+        # We scale the progress to post incremental progress updates from 0 to n_tasks
+        progress = to_scaled_progress(progress, start=fetch_unclassified_progress)
+        progress = to_raw_progress(progress, max_progress=n_tasks)
+    # NOTE(review): assuming batches() yields consecutive chunks of task_batch_size
+    # items, more than n_tasks tasks can be created when n_docs is not a multiple of
+    # task_batch_size (e.g. 7 docs and 2 workers: n_tasks is 3 but 4 batches of at
+    # most 2 docs), in which case the logged count and the progress scale are off;
+    # confirm against the batches() implementation.
+    logger.info("creating %s classification tasks...", n_tasks)
+    # We create classification tasks which will be picked up by the workers
+    args = {"project": project, "config": config.dict(), "language": language}
+    for batch in batches(unclassified, task_batch_size):
+        # The same args dict is reused for every task, only "docs" changes
+        args["docs"] = batch
+        task_id = await task_client.create_task(
+            "classify_docs", args, group=PYTHON_TASK_GROUP.name
+        )
+        task_ids.append(task_id)
+        if progress is not None:
+            await progress(len(task_ids))
+    logger.info("created all classification tasks !")
+    return task_ids
+_CLASSIF_DOC_SOURCES = [DOC_CONTENT, DOC_ROOT_ID, DOC_CONTENT_TRANSLATED, DOC_LANGUAGE]
+async def classify_docs(
+    docs: list[str],
+    *,
+    language: str,
+    project: str,
+    config: ClassificationConfig | None = None,
+    progress: PercentProgress | None = None,
+    es_client: ESClient | None = None,
+) -> int:
+    """Classify the documents with the given IDs and tag them with the result.
+    Documents are fetched from the project index by batches of
+    ``config.batch_size``; those with content available in ``language`` (original
+    or translated) are classified and tagged, the others are skipped.
+    ``config`` defaults to a default ``ClassificationConfig`` and ``es_client`` to
+    the lifespan ES client; ``progress``, when provided, is updated after each
+    batch. Returns the number of input document IDs.
+    """
+    # The task entry point forwards config=None when no config is provided
+    if config is None:
+        config = ClassificationConfig()
+    if es_client is None:
+        es_client = lifespan_es_client()
+    n_docs = len(docs)
+    model = config.model
+    # Torch/macOS silicon stuff
+    device = None
+    if torch.backends.mps.is_available():
+        device = torch.device("mps")
+    # Load the classification pipeline
+    pipe = pipeline(config.task, model=model, device=device)
+    model = pipe.model.name_or_path
+    # Convert the progress to a "raw" progress to update the progress incrementally
+    # from 0 to n_docs (rather than 0.0 to 1.0), progress being optional
+    if progress is not None:
+        progress = to_raw_progress(progress, max_progress=n_docs)
+    seen = 0
+    # We batch the data ourselves, ideally, we should use an async version of:
+    # https://huggingface.co/docs/datasets/v3.1.0/en/package_reference/main_classes#datasets.Dataset.from_generator
+    for batch in batches(docs, batch_size=config.batch_size):
+        batch_length = len(batch)
+        batch_docs = []
+        # Restrict the search to the project index, in which tags are then written
+        async for page in es_client.poll_search_pages(
+            index=project,
+            body={QUERY: has_id(batch)},
+            _source_includes=_CLASSIF_DOC_SOURCES,
+        ):
+            batch_docs.extend(Document.from_es(doc) for doc in page[HITS][HITS])
+        # Keep only the docs with content in the requested language
+        with_content = []
+        for doc in batch_docs:
+            content = _get_language_content(doc, language)
+            if content is not None:
+                with_content.append((doc, content))
+        # Unpacking zip(*pairs) fails on an empty batch, skip it
+        if with_content:
+            classified_docs, contents = zip(*with_content)
+            labels = _classify(pipe, list(contents))
+            # We add the classification results by updating the documents with new
+            # tags, this could also be done using:
+            # https://github.com/ICIJ/datashare-tarentula
+            await _add_classification_tags(
+                es_client, zip(classified_docs, labels), project, model=model
+            )
+        seen += batch_length
+        if progress is not None:
+            await progress(seen)
+    # Return the number of input documents
+    return n_docs
+def _classify(pipe: Pipeline, texts: list[str]) -> Generator[str, None, None]:
+    """Lazily yield the "label" entry of each pipeline result for ``texts``."""
+    # In practice, we should chunk the text
+    yield from (res["label"] for res in pipe(texts, padding=True, truncation=True))
+def _get_language_content(doc: Document, language: str) -> Optional[str]:
+    """Return the document content in ``language``: the original content when the
+    document is in that language, its translation otherwise (``None`` if missing).
+    """
+    if doc.language != language:
+        return doc.content_translated.get(language)
+    return doc.content
+_SCRIPT_SOURCES = """
+if( !ctx._source.containsKey("tags") ) {
+    ctx._source.tags = [];
+}
+if( !ctx._source.tags.contains(params.tag) ) {
+    ctx._source.tags.add(params.tag);
+}
+"""
+async def _add_classification_tags(
+    es_client: ESClient,
+    tags: Iterable[tuple[Document, str]],
+    project: str,
+    *,
+    model: str,
+):
+    """Bulk-update the documents of the ``project`` index, adding to each of them
+    a ``classified:<model>:<label>`` tag through the ``_SCRIPT_SOURCES`` script.
+    """
+    def _update_actions():
+        # Actions are generated lazily, one update per (document, label) pair
+        for doc, label in tags:
+            tagging_script = {
+                "source": _SCRIPT_SOURCES,
+                "lang": "painless",
+                "params": {"tag": f"classified:{model}:{label}"},
+            }
+            yield bulk_action(
+                op_type=UPDATE,
+                index=project,
+                id_=doc.id,
+                routing=doc.root_document,
+                script=tagging_script,
+            )
+    await async_bulk(
+        es_client, _update_actions(), raise_on_error=True, refresh="wait_for"
+    )
+def _unclassified_query(model: str, language: str):
+    """Build the query matching the documents which have no tag prefixed with
+    ``classified:<model>:`` and which are either in ``language`` or have a
+    translated content for ``language``.
+    """
+    # Get documents which aren't tagged yet
+    tag_prefix = f"classified:{model}:"
+    not_tagged_yet = {BOOL: {MUST_NOT: {"prefix": {"tags": {"value": tag_prefix}}}}}
+    # And which are either in the model language or are translated in the model
+    # language
+    translated_field = f"{DOC_CONTENT_TRANSLATED}.{language}"
+    available_in_language = {
+        BOOL: {
+            SHOULD: [
+                {"exists": {"field": translated_field}},
+                {TERM: {DOC_LANGUAGE: language}},
+            ]
+        }
+    }
+    return and_query(not_tagged_yet, available_in_language)
+async def _get_unclassified(
+    es_client: ESClient, project: str, *, language: str, model: str, **kwargs
+) -> AsyncGenerator[dict, None]:
+    """Yield, one by one, the hits of the ``project`` index matching the
+    unclassified query; extra ``kwargs`` are forwarded to the paginated search.
+    """
+    search_body = _unclassified_query(model, language=language)
+    pages = es_client.poll_search_pages(
+        index=project,
+        body=search_body,
+        sort="_doc:asc",
+        _source=False,
+        **kwargs,
+    )
+    async for page in pages:
+        for hit in page[HITS][HITS]:
+            yield hit

datashare_python/tasks/dependencies.py ADDED Viewed

@@ -0,0 +1,110 @@
+import logging
+from icij_common.es import ESClient
+from icij_worker import WorkerConfig
+from icij_worker.ds_task_client import DatashareTaskClient
+from icij_worker.utils.dependencies import DependencyInjectionError
+from datashare_python.config import AppConfig
+logger = logging.getLogger(__name__)
+# Lifespan dependencies are global variables which functions access by calling
+# lifespan_<dep_name>(), which returns the global variable.
+# The variable itself is created and set up in the <dep_name>_setup function and, if
+# needed, torn down in the <dep_name>_teardown function.
+# The setup and teardown functions are registered in the APP_LIFESPAN_DEPS list, which
+# is then passed to the AsyncApp when creating it. The app takes care of setting up
+# and tearing down all dependencies in the list. Since a dependency might depend on
+# another one, the order in which they are registered matters.
+# We hence start by registering the configuration; other deps are created from it.
+_ASYNC_APP_CONFIG: AppConfig | None = None
+_ES_CLIENT: ESClient | None = None
+_TASK_CLIENT: DatashareTaskClient | None = None
+# App loading setup
+def load_app_config(worker_config: WorkerConfig, **_):
+    """Set the global app config: parsed from the worker's bootstrap config file
+    when a path is configured, built from defaults otherwise."""
+    global _ASYNC_APP_CONFIG
+    bootstrap_path = worker_config.app_bootstrap_config_path
+    if bootstrap_path is None:
+        _ASYNC_APP_CONFIG = AppConfig()
+        return
+    _ASYNC_APP_CONFIG = AppConfig.parse_file(bootstrap_path)
+# Returns the globally injected config
+def lifespan_config() -> AppConfig:
+    """Return the global app config, raising ``DependencyInjectionError`` when it
+    has not been loaded."""
+    loaded_config = _ASYNC_APP_CONFIG
+    if loaded_config is not None:
+        return loaded_config
+    raise DependencyInjectionError("config")
+# Loggers setup
+def setup_loggers(worker_id: str, **_):
+    """Set up the loggers from the global app config for the given worker, then
+    log the config."""
+    app_config = lifespan_config()
+    app_config.setup_loggers(worker_id=worker_id)
+    logger.info("worker loggers ready to log 💬")
+    config_as_json = app_config.json(indent=2)
+    logger.info("app config: %s", config_as_json)
+# Elasticsearch client setup
+async def es_client_setup(**_):
+    """Create the global ES client from the app config and enter its context."""
+    # pylint: disable=unnecessary-dunder-call
+    global _ES_CLIENT
+    _ES_CLIENT = lifespan_config().to_es_client()
+    await _ES_CLIENT.__aenter__()
+# Elasticsearch client teardown
+async def es_client_teardown(exc_type, exc_val, exc_tb):
+    """Exit the global ES client context, then unset the global client."""
+    # pylint: disable=unnecessary-dunder-call
+    global _ES_CLIENT
+    client = lifespan_es_client()
+    await client.__aexit__(exc_type, exc_val, exc_tb)
+    _ES_CLIENT = None
+# Returns the globally injected ES client
+def lifespan_es_client() -> ESClient:
+    """Return the global ES client, raising ``DependencyInjectionError`` when it
+    has not been set up."""
+    # pylint: disable=unnecessary-dunder-call
+    client = _ES_CLIENT
+    if client is not None:
+        return client
+    raise DependencyInjectionError("es client")
+# Task client setup
+async def task_client_setup(**_):
+    """Create the global task client from the app config and enter its context."""
+    # pylint: disable=unnecessary-dunder-call
+    global _TASK_CLIENT
+    _TASK_CLIENT = lifespan_config().to_task_client()
+    await _TASK_CLIENT.__aenter__()
+# Task client teardown
+async def task_client_teardown(exc_type, exc_val, exc_tb):
+    """Exit the global task client context, then unset the global client."""
+    # pylint: disable=unnecessary-dunder-call
+    global _TASK_CLIENT
+    client = lifespan_task_client()
+    await client.__aexit__(exc_type, exc_val, exc_tb)
+    _TASK_CLIENT = None
+# Returns the globally injected task client
+def lifespan_task_client() -> DatashareTaskClient:
+    """Return the global task client, raising ``DependencyInjectionError`` when it
+    has not been set up."""
+    # pylint: disable=unnecessary-dunder-call
+    client = _TASK_CLIENT
+    if client is not None:
+        return client
+    raise DependencyInjectionError("task client")
+# Register all dependencies in the format of:
+# (<logging helper>, <dep setup>, <dep teardown>)
+APP_LIFESPAN_DEPS = [
+    ("loading async app configuration", load_app_config, None),
+    ("loggers", setup_loggers, None),
+    ("elasticsearch client", es_client_setup, es_client_teardown),
+    ("task client", task_client_setup, task_client_teardown),
+]

datashare_python/tasks/translate_docs.py ADDED Viewed

@@ -0,0 +1,223 @@
+import logging
+from functools import partial
+from typing import AsyncGenerator, Generator, Iterable
+import torch
+from aiostream.stream import chain
+from elasticsearch._async.helpers import async_bulk
+from icij_common.es import (
+    BOOL,
+    COUNT,
+    DOC_CONTENT,
+    DOC_LANGUAGE,
+    DOC_ROOT_ID,
+    ESClient,
+    HITS,
+    ID_,
+    QUERY,
+    SOURCE,
+    TERM,
+    has_id,
+    must_not,
+)
+from icij_worker.ds_task_client import DatashareTaskClient
+from icij_worker.typing_ import PercentProgress
+from icij_worker.utils.progress import to_raw_progress
+from transformers import Pipeline, pipeline
+from datashare_python.constants import PYTHON_TASK_GROUP
+from datashare_python.objects import Document, TranslationConfig
+from datashare_python.tasks.dependencies import lifespan_es_client, lifespan_task_client
+from datashare_python.utils import async_batches, batches, before_and_after, once
+logger = logging.getLogger(__name__)
+async def create_translation_tasks(
+    *,
+    project: str,
+    target_language: str,
+    config: TranslationConfig | None = None,
+    es_client: ESClient | None = None,
+    task_client: DatashareTaskClient | None = None,
+) -> list[str]:
+    if es_client is None:
+        es_client = lifespan_es_client()
+    if task_client is None:
+        task_client = lifespan_task_client()
+    task_ids = []
+    if config is None:
+        config = TranslationConfig()
+    # Retrieve unprocessed docs.
+    docs_by_language = _untranslated_by_language(
+        es_client, project, target_language=target_language
+    )
+    args = {
+        "project": project,
+        "config": config.dict(),
+        "target_language": target_language,
+    }
+    # We could set this to a smarter value
+    task_batch_size = config.batch_size * 4
+    current_language = None
+    async for language_docs in docs_by_language:
+        async for batch in async_batches(language_docs, task_batch_size):
+            language = batch[0][SOURCE][DOC_LANGUAGE]
+            batch = [doc[ID_] for doc in batch]
+            if language != current_language:
+                logger.info("creating translation task for docs in %s", language)
+            args["docs"] = batch
+            task_id = await task_client.create_task(
+                "translate_docs", args, group=PYTHON_TASK_GROUP.name
+            )
+            task_ids.append(task_id)
+    logger.info("done creating %s translation tasks", len(task_ids))
+    return task_ids
+# Document fields requested from ES (through _source_includes) by translate_docs
+_TRANSLATION_DOC_SOURCES = [DOC_CONTENT, DOC_ROOT_ID, DOC_LANGUAGE]
+async def translate_docs(
+    docs: list[str],
+    target_language: str,
+    *,
+    project: str,
+    es_client: ESClient | None = None,
+    progress: PercentProgress | None = None,
+    config: TranslationConfig = TranslationConfig(),
+) -> int:
+    if es_client is None:
+        es_client = lifespan_es_client()
+    n_docs = len(docs)
+    if not n_docs:
+        return 0
+    # Torch/macOS silicon stuff
+    device = None
+    if torch.backends.mps.is_available():
+        device = torch.device("mps")
+    seen = 0
+    # Convert the progress to a "raw" progress to update the progress incrementally
+    # rather than setting the progress rate
+    progress = to_raw_progress(progress, max_progress=n_docs)
+    pipe = None
+    # We batch the data ourselves, ideally, we should use an async version of:
+    # https://huggingface.co/docs/datasets/v3.1.0/en/package_reference/main_classes#datasets.Dataset.from_generator
+    for batch in batches(docs, batch_size=config.batch_size):
+        batch_docs = []
+        async for page in es_client.poll_search_pages(
+            body={QUERY: has_id(batch)},
+            _source_includes=_TRANSLATION_DOC_SOURCES,
+        ):
+            batch_docs.extend((Document.from_es(doc) for doc in page[HITS][HITS]))
+        if pipe is None:
+            source_language = batch_docs[0].language
+            kwargs = config.to_pipeline_args(
+                source_language, target_language=target_language
+            )
+            pipe = pipeline(device=device, **kwargs)
+        # Load the classification pipeline
+        contents = [d.content for d in batch_docs]
+        translations = _translate(pipe, contents)
+        await _add_translation(
+            es_client,
+            zip(batch_docs, translations),
+            project,
+            target_language=target_language,
+        )
+        seen += len(batch)
+        if progress is not None:
+            await progress(seen)
+    # Return the number of classified documents
+    return n_docs
+def _translate(pipe: Pipeline, texts: list[str]) -> Generator[str, None, None]:
+    for res in pipe(texts):
+        yield res["translation_text"]
+def _has_language(doc: dict, language: str) -> bool:
+    return doc[SOURCE][DOC_LANGUAGE] == language
+async def _untranslated_by_language(
+    es_client: ESClient, project: str, target_language: str
+) -> AsyncGenerator[AsyncGenerator[list[str], None], None]:
+    """Yield the untranslated hits grouped by runs of identical language.
+
+    Each yielded item is an async iterable over the consecutive hits returned by
+    ``_get_untranslated`` which share the same language.
+    """
+    # NOTE(review): the annotation advertises list[str] items, but the groups yield
+    # the hits produced by _get_untranslated (annotated as dict there)
+    # NOTE(review): a group has to be consumed before asking for the next one, the
+    # remainder returned by before_and_after only resolves once the group iterator
+    # reached a hit in another language or the end of the hits
+    docs = _get_untranslated(es_client, project, target_language=target_language)
+    while True:
+        try:
+            # Pull the next hit to learn the language of the upcoming group
+            next_doc = await anext(aiter(docs))
+        except StopAsyncIteration:
+            # No hit left, we are done
+            return
+        current_language = next_doc[SOURCE][DOC_LANGUAGE]
+        # Split what is left into the hits still in the current language and the
+        # ones coming after, the latter are the input of the next loop turn
+        language_docs, docs = before_and_after(
+            docs, partial(_has_language, language=current_language)
+        )
+        # Put the hit we pulled back in front of its group
+        yield chain(once(next_doc), language_docs)
+# Update script (run with lang "painless" by _add_translation): creates the
+# content_translated map on the document when missing, then stores
+# params.translation under the params.language key
+_SCRIPT_SOURCES = """
+if( !ctx._source.containsKey("content_translated") ) {
+    ctx._source.content_translated = new HashMap();
+}
+ctx._source.content_translated[params.language] = params.translation;
+"""
+async def _add_translation(
+    es_client: ESClient,
+    translations: Iterable[tuple[Document, str]],
+    project: str,
+    *,
+    target_language: str,
+):
+    actions = (
+        {
+            "_op_type": "update",
+            "_index": project,
+            "_routing": doc.root_document,
+            ID_: doc.id,
+            "script": {
+                "source": _SCRIPT_SOURCES,
+                "lang": "painless",
+                "params": {"language": target_language, "translation": translation},
+            },
+        }
+        for doc, translation in translations
+    )
+    await async_bulk(es_client, actions, raise_on_error=True, refresh="wait_for")
+def _untranslated_query(target_language: str):
+    query = {
+        "query": {
+            BOOL: must_not(
+                {"exists": {"field": f"content_translated.{target_language}"}},
+                {TERM: {DOC_LANGUAGE: target_language}},
+            )
+        }
+    }
+    return query
+async def _get_untranslated(
+    es_client: ESClient, project: str, *, target_language: str
+) -> AsyncGenerator[dict, None]:
+    async for res in es_client.poll_search_pages(
+        index=project,
+        body=_untranslated_query(target_language),
+        _source_includes=[DOC_LANGUAGE],
+        sort=[f"{DOC_LANGUAGE}:asc", "_doc:asc"],
+    ):
+        for hit in res[HITS][HITS]:
+            yield hit
+async def _count_untranslated(
+    es_client: ESClient, project: str, *, target_language: str
+) -> int:
+    res = await es_client.count(
+        index=project, body=_untranslated_query(target_language)
+    )
+    return res[COUNT]

datashare_python/utils.py ADDED Viewed

@@ -0,0 +1,69 @@
+import asyncio
+import inspect
+from itertools import islice
+from typing import AsyncIterable, AsyncIterator, Awaitable, Callable, Iterable, TypeVar
+T = TypeVar("T")
+Predicate = Callable[[T], bool] | Callable[[T], Awaitable[bool]]
+async def async_batches(
+    iterable: AsyncIterable[T], batch_size: int
+) -> AsyncIterator[tuple[T]]:
+    it = aiter(iterable)
+    if batch_size < 1:
+        raise ValueError("n must be at least one")
+    while True:
+        batch = []
+        while len(batch) < batch_size:
+            try:
+                batch.append(await anext(it))
+            except StopAsyncIteration:
+                if batch:
+                    yield tuple(batch)
+                return
+        yield tuple(batch)
+def batches(iterable: Iterable[T], batch_size: int):
+    if batch_size < 1:
+        raise ValueError("n must be at least one")
+    it = iter(iterable)
+    while batch := tuple(islice(it, batch_size)):
+        yield batch
+async def maybe_await(maybe_awaitable: Awaitable[T] | T) -> T:
+    if inspect.isawaitable(maybe_awaitable):
+        return await maybe_awaitable
+    return maybe_awaitable
+async def once(item: T) -> AsyncIterator[T]:
+    """Async generator yielding ``item`` exactly once."""
+    yield item
+def before_and_after(
+    iterable: AsyncIterable[T], predicate: Predicate[T]
+) -> tuple[AsyncIterable[T], AsyncIterable[T]]:
+    """Split an async iterable at the first item which fails ``predicate``.
+
+    Returns a pair of async iterators: the first one yields the leading items
+    for which ``predicate`` (sync or async) is truthy, the second one yields the
+    first failing item followed by everything left in ``iterable``.
+    """
+    # NOTE(review): both iterators loop over the very same `iterable`, this assumes
+    # a one-shot async iterator which resumes where it was left - confirm
+    # NOTE(review): the future is bound to the loop returned by
+    # asyncio.get_event_loop(), this is presumably meant to be called while a
+    # loop is running - confirm
+    # Future used to hand over the first failing item (or the end of iteration)
+    # from the first iterator to the second one
+    transition = asyncio.get_event_loop().create_future()
+    async def true_iterator():
+        async for elem in iterable:
+            if await maybe_await(predicate(elem)):
+                yield elem
+            else:
+                # First failing item: pass it to the remainder and stop here
+                transition.set_result(elem)
+                return
+        # Exhausted without any failing item: tell the remainder it is empty
+        transition.set_exception(StopAsyncIteration)
+    async def remainder_iterator():
+        try:
+            # Waits until true_iterator resolved the future, i.e. until it met a
+            # failing item or got exhausted
+            yield await transition
+        except StopAsyncIteration:
+            return
+        async for elm in iterable:
+            yield elm
+    return true_iterator(), remainder_iterator()

datashare_python-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,80 @@
+Metadata-Version: 2.1
+Name: datashare-python
+Version: 0.1.0
+Summary: Implement Datashare task in Python
+Author-Email: =?utf-8?q?Cl=C3=A9ment_Doumouro?= <cdoumouro@icij.org>, =?utf-8?q?Cl=C3=A9ment_Doumouro?= <clement.doumouro@gmail.com>
+Requires-Python: ~=3.11
+Requires-Dist: aiostream~=0.6.4
+Requires-Dist: aiohttp~=3.11.9
+Requires-Dist: icij-common[elasticsearch]~=0.5.5
+Requires-Dist: icij-worker[amqp]~=0.12
+Requires-Dist: torch==2.6.0.dev20241101; sys_platform != "darwin"
+Requires-Dist: torch!=2.6.0.dev20241101+cpu,<=2.6.0.dev20241101; sys_platform == "darwin"
+Requires-Dist: transformers~=4.46.3
+Requires-Dist: pycountry>=24.6.1
+Requires-Dist: sentencepiece>=0.2.0
+Requires-Dist: typer>=0.13.1
+Requires-Dist: alive-progress>=3.2.0
+Description-Content-Type: text/markdown
+<div style="background-image: linear-gradient(45deg, #193d87, #fa4070);">
+  <br/>
+  <p align="center">
+    <a href="https://datashare.icij.org/">
+      <img align="center" src="docs/assets/datashare-logo.svg" alt="Datashare" style="max-width: 60%">
+    </a>
+  </p>
+  <p align="center">
+    <em>Better analyze information, in all its forms</em>
+  </p>
+  <br/>
+</div>
+<br/>
+---
+**Documentation**: <a href="https://icij.github.io/datashare-python" target="_blank">https://icij.github.io/datashare-python</a>
+---
+# Implement **your own Datashare tasks**, written in Python
+Most AI, Machine Learning and Data Engineering work happens in Python.
+[Datashare](https://icij.gitbook.io/datashare) now lets you extend its backend with your own tasks implemented in Python.
+Turning your own ML pipelines into Datashare tasks is **very simple**; learn how in the [documentation](https://icij.github.io/datashare-python).
+Turning your own ML pipelines into Datashare tasks is **very simple**.
+Actually, it's *almost* as simple as cloning our [template repo](https://github.com/ICIJ/datashare-python):
+```
+$ git clone git@github.com:ICIJ/datashare-python.git
+```
+replacing existing [app](https://github.com/ICIJ/datashare-python/blob/main/datashare_python/app.py) tasks with your own:
+```python
+from icij_worker import AsyncApp
+app = AsyncApp("app")
+@app.task
+def hello_world() -> str:
+    return "Hello world"
+```
+installing [`uv`](https://docs.astral.sh/uv/) to set up dependencies and running your async Datashare worker:
+```console
+$ cd datashare-python
+$ curl -LsSf https://astral.sh/uv/install.sh | sh
+$ uv run ./scripts/worker_entrypoint.sh
+[INFO][icij_worker.backend.backend]: Loading worker configuration from env...
+...
+}
+[INFO][icij_worker.backend.mp]: starting 1 worker for app datashare_python.app.app
+...
+```
+you'll then be able to execute tasks using our [HTTP client]() (and soon using Datashare's UI).
+## Learn more by reading our [documentation](https://icij.github.io/datashare-python)!

datashare_python-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+datashare_python-0.1.0.dist-info/METADATA,sha256=xt9hC3iIosOo3sEeePvELBAIYx-yg7b80wB9AewYlzs,2796
+datashare_python-0.1.0.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
+datashare_python-0.1.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+datashare_python/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datashare_python/__main__.py,sha256=agVB_DGJFNXl4AVVwb14i6fF8AXzyAXp8ykOGRPtB3A,83
+datashare_python/app.py,sha256=QI8fVR6s6YU7S45s3hTGM7DqdISA4T7XnXU4UYW9-0E,2582
+datashare_python/cli/__init__.py,sha256=wcNr-Tp5YmJCYAx4rBTq5EMaoNcPB-9On8LDcopg5v4,818
+datashare_python/cli/tasks.py,sha256=rxYlDs0WWe1-bBtGshw4TxZ14u-uaeOTsx4DhZX0Ayo,5870
+datashare_python/cli/utils.py,sha256=vykjfBBYW5hZVWn7eR3YyBb5NcV35mO7k_wIB5UNJiA,772
+datashare_python/config.py,sha256=vsNExCz8J8mBUm7y-6FxGKzRVh5ULnlu3k9015lBXOI,1813
+datashare_python/constants.py,sha256=naesOTqywwIN5IIZH3GLBcst7qLOvcQZLwxeZZ_IBI0,161
+datashare_python/objects.py,sha256=ThFhy0XuVUYx_A47wJmFy3eID1kbSDcoxTiRbK3RrEo,1708
+datashare_python/task_client.py,sha256=sVKFg7k3q7JjMEOObcdo4f7vCF72kbhPq5A4lGbu2qA,4528
+datashare_python/tasks/__init__.py,sha256=xhlmAx5SMJ14kJQEZj8jpQU7_1lnXYhFhr7xpChtclQ,139
+datashare_python/tasks/classify_docs.py,sha256=I1y6SyrRb_zcppUu_RmhepXjg00OlLbJ75Exw2BYNls,7764
+datashare_python/tasks/dependencies.py,sha256=Niua0YrqIVgXTIfEghwaJmRia1XDGkiBVl__Hge8fFk,3661
+datashare_python/tasks/translate_docs.py,sha256=Qu-b1qZUj18PTI-rsr3oSCGTHLYhNzf0ip9UC7G-CJY,7142
+datashare_python/utils.py,sha256=y-ZoHt-1v-18o7EoCXD2fMYUxwrz7mZLgbzbggkm_Vc,1901
+datashare_python-0.1.0.dist-info/RECORD,,

datashare_python-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: pdm-backend (2.4.3)
+Root-Is-Purelib: true
+Tag: py3-none-any

datashare_python-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,4 @@
+[console_scripts]
+[gui_scripts]