amsdal_ml 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.github/workflows/ci.yml +18 -3
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.gitignore +2 -0
- amsdal_ml-0.2.0/CLAUDE.md +171 -0
- amsdal_ml-0.2.0/PKG-INFO +293 -0
- amsdal_ml-0.2.0/README.md +275 -0
- amsdal_ml-0.2.0/RELEASE.md +180 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md +617 -0
- amsdal_ml-0.2.0/amsdal_ml/__about__.py +1 -0
- amsdal_ml-0.2.0/amsdal_ml/agents/__init__.py +13 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/agents/agent.py +5 -7
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/agents/default_qa_agent.py +108 -143
- amsdal_ml-0.2.0/amsdal_ml/agents/functional_calling_agent.py +233 -0
- amsdal_ml-0.2.0/amsdal_ml/agents/mcp_client_tool.py +46 -0
- amsdal_ml-0.2.0/amsdal_ml/agents/python_tool.py +86 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/agents/retriever_tool.py +17 -8
- amsdal_ml-0.2.0/amsdal_ml/agents/tool_adapters.py +98 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/fileio/base_loader.py +7 -5
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/fileio/openai_loader.py +16 -17
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_client/base.py +2 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_client/http_client.py +7 -1
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_client/stdio_client.py +21 -18
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_server/server_retriever_stdio.py +8 -11
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/__init__.py +29 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/default_ingesting.py +49 -51
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/embedders/__init__.py +4 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/embedders/embedder.py +12 -0
- amsdal_ml-0.1.3/amsdal_ml/ml_retrievers/openai_retriever.py → amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/embedders/openai_embedder.py +6 -15
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/embedding_data.py +3 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/__init__.py +6 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/folder_loader.py +52 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/loader.py +28 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/pdf_loader.py +136 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/text_loader.py +44 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/model_ingester.py +278 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/pipeline.py +131 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/pipeline_interface.py +31 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/processors/__init__.py +4 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/processors/cleaner.py +14 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/processors/text_cleaner.py +42 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/splitters/__init__.py +4 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/splitters/splitter.py +15 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/splitters/token_splitter.py +85 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/stores/__init__.py +4 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/stores/embedding_data.py +63 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/stores/store.py +22 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/types.py +40 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_models/models.py +179 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_models/openai_model.py +679 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_models/utils.py +7 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/__init__.py +17 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/adapters.py +93 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_retrievers/default_retriever.py +11 -1
- amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/openai_retriever.py +59 -0
- amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/query_retriever.py +487 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_retrievers/retriever.py +12 -0
- amsdal_ml-0.2.0/amsdal_ml/models/embedding_model.py +21 -0
- amsdal_ml-0.2.0/amsdal_ml/prompts/__init__.py +77 -0
- amsdal_ml-0.2.0/amsdal_ml/prompts/database_query_agent.prompt +14 -0
- amsdal_ml-0.2.0/amsdal_ml/prompts/functional_calling_agent_base.prompt +9 -0
- amsdal_ml-0.2.0/amsdal_ml/prompts/nl_query_filter.prompt +318 -0
- {amsdal_ml-0.1.3/amsdal_ml/agents/promts → amsdal_ml-0.2.0/amsdal_ml/prompts}/react_chat.prompt +17 -8
- amsdal_ml-0.2.0/amsdal_ml/utils/__init__.py +5 -0
- amsdal_ml-0.2.0/amsdal_ml/utils/query_utils.py +189 -0
- amsdal_ml-0.2.0/change-logs.md +49 -0
- amsdal_ml-0.2.0/docker-compose.tests.yml +16 -0
- amsdal_ml-0.2.0/latest-changelogs.md +14 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/pyproject.toml +7 -3
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_arun.py +2 -2
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_astream.py +7 -7
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_astream_final_only.py +5 -5
- amsdal_ml-0.2.0/tests/agents_tests/test_fakes.py +272 -0
- amsdal_ml-0.2.0/tests/agents_tests/test_functional_calling_agent.py +241 -0
- amsdal_ml-0.2.0/tests/agents_tests/test_qa_agent_with_nlq_tool.py +268 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_tool_call_arguments_async.py +16 -6
- amsdal_ml-0.2.0/tests/conftest.py +220 -0
- amsdal_ml-0.2.0/tests/fixtures/models/author.py +15 -0
- amsdal_ml-0.2.0/tests/fixtures/models/book.py +16 -0
- amsdal_ml-0.2.0/tests/fixtures/models/category.py +16 -0
- amsdal_ml-0.2.0/tests/fixtures/models/order.py +18 -0
- amsdal_ml-0.2.0/tests/fixtures/models/product.py +18 -0
- amsdal_ml-0.2.0/tests/fixtures/models/user.py +18 -0
- amsdal_ml-0.2.0/tests/fixtures/models/vehicle.py +32 -0
- amsdal_ml-0.2.0/tests/ingesting/test_folder_loader_and_rag.py +120 -0
- amsdal_ml-0.2.0/tests/ingesting/test_ingestion_components.py +232 -0
- amsdal_ml-0.2.0/tests/ingesting/test_model_ingester.py +140 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/mock_tests/__init__.py +0 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/mock_tests/test_retriever_mock.py +651 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/models_tests/__init__.py +0 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/models_tests/test_nl_query_models.py +305 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/schema_tests/__init__.py +0 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/schema_tests/test_comprehensive_schema.py +348 -0
- amsdal_ml-0.2.0/tests/nlqueryretriever_tests/schema_tests/test_nested_list_skipping.py +135 -0
- amsdal_ml-0.2.0/tests/test_files/.gitkeep +0 -0
- amsdal_ml-0.2.0/tests/test_files/pdf/Aspida.pdf +0 -0
- amsdal_ml-0.2.0/tests/test_files/pdf/Nassau.pdf +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/test_openai_model.py +64 -2
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/uv.lock +1030 -768
- amsdal_ml-0.1.3/PKG-INFO +0 -69
- amsdal_ml-0.1.3/README.md +0 -52
- amsdal_ml-0.1.3/amsdal_ml/__about__.py +0 -1
- amsdal_ml-0.1.3/amsdal_ml/agents/promts/__init__.py +0 -58
- amsdal_ml-0.1.3/amsdal_ml/ml_models/models.py +0 -87
- amsdal_ml-0.1.3/amsdal_ml/ml_models/openai_model.py +0 -371
- amsdal_ml-0.1.3/amsdal_ml/models/embedding_model.py +0 -21
- amsdal_ml-0.1.3/change-logs.md +0 -24
- amsdal_ml-0.1.3/latest-changelogs.md +0 -6
- amsdal_ml-0.1.3/tests/agents_tests/test_fakes.py +0 -173
- amsdal_ml-0.1.3/tests/conftest.py +0 -105
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal/.dependencies +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal/.environment +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal/.secrets +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal-cli +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.github/workflows/release.yml +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.github/workflows/tag_check.yml +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/__init__.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/app.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/agents → amsdal_ml-0.2.0/amsdal_ml/fileio}/__init__.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/fileio → amsdal_ml-0.2.0/amsdal_ml/mcp_client}/__init__.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/mcp_client → amsdal_ml-0.2.0/amsdal_ml/mcp_server}/__init__.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/migrations/0000_initial.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_config.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/ingesting.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/openai_ingesting.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/mcp_server → amsdal_ml-0.2.0/amsdal_ml/ml_models}/__init__.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/ml_ingesting → amsdal_ml-0.2.0/amsdal_ml/models}/__init__.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/py.typed +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/config.yml +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/license_check.py +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/scripts/release.sh +0 -0
- {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/scripts/tag_check.sh +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/ml_models → amsdal_ml-0.2.0/tests}/__init__.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml/ml_retrievers → amsdal_ml-0.2.0/tests/agents_tests}/__init__.py +0 -0
- {amsdal_ml-0.1.3/amsdal_ml → amsdal_ml-0.2.0/tests/fixtures}/models/__init__.py +0 -0
- {amsdal_ml-0.1.3/tests → amsdal_ml-0.2.0/tests/ingesting}/__init__.py +0 -0
- {amsdal_ml-0.1.3/tests/agents_tests → amsdal_ml-0.2.0/tests/nlqueryretriever_tests}/__init__.py +0 -0
|
@@ -23,7 +23,7 @@ jobs:
|
|
|
23
23
|
python license_check.py
|
|
24
24
|
|
|
25
25
|
test-lint:
|
|
26
|
-
name: Run tests and check style
|
|
26
|
+
name: Run tests and check style (Python ${{ matrix.python-version }}, ${{ matrix.database-backend }})
|
|
27
27
|
needs: [license-check]
|
|
28
28
|
runs-on: self-hosted
|
|
29
29
|
strategy:
|
|
@@ -31,6 +31,21 @@ jobs:
|
|
|
31
31
|
fail-fast: false
|
|
32
32
|
matrix:
|
|
33
33
|
python-version: ["3.11", "3.12"]
|
|
34
|
+
database-backend: ["sqlite", "postgres"]
|
|
35
|
+
services:
|
|
36
|
+
postgres:
|
|
37
|
+
image: pgvector/pgvector:pg16
|
|
38
|
+
env:
|
|
39
|
+
POSTGRES_USER: postgres
|
|
40
|
+
POSTGRES_PASSWORD: example
|
|
41
|
+
POSTGRES_DB: postgres
|
|
42
|
+
ports:
|
|
43
|
+
- 5432:5432
|
|
44
|
+
options: >-
|
|
45
|
+
--health-cmd pg_isready
|
|
46
|
+
--health-interval 10s
|
|
47
|
+
--health-timeout 5s
|
|
48
|
+
--health-retries 5
|
|
34
49
|
env:
|
|
35
50
|
PYTHON: ${{ matrix.python-version }}
|
|
36
51
|
DEPS: yes
|
|
@@ -54,9 +69,9 @@ jobs:
|
|
|
54
69
|
hatch run sync
|
|
55
70
|
|
|
56
71
|
- name: Run style checks
|
|
57
|
-
if: always()
|
|
72
|
+
if: always() && matrix.database-backend == 'sqlite'
|
|
58
73
|
run: hatch run all
|
|
59
74
|
|
|
60
75
|
- name: Run tests
|
|
61
76
|
if: always()
|
|
62
|
-
run: hatch run cov tests/
|
|
77
|
+
run: hatch run cov tests/ -- --database_backend=${{ matrix.database-backend }}
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
amsdal-ml is a machine learning plugin for the AMSDAL Framework that provides embeddings, vector search, and AI-driven features. It supports both synchronous and asynchronous modes, with a primary focus on async operations using OpenAI models.
|
|
8
|
+
|
|
9
|
+
## Development Commands
|
|
10
|
+
|
|
11
|
+
### Environment Setup
|
|
12
|
+
```bash
|
|
13
|
+
# Install dependencies using hatch/uv
|
|
14
|
+
pip install --upgrade uv hatch==1.14.2
|
|
15
|
+
hatch env create
|
|
16
|
+
hatch run sync
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Testing
|
|
20
|
+
```bash
|
|
21
|
+
# Run all tests with coverage
|
|
22
|
+
hatch run cov
|
|
23
|
+
|
|
24
|
+
# Run specific test file
|
|
25
|
+
hatch run test tests/test_openai_model.py
|
|
26
|
+
|
|
27
|
+
# Run tests with pytest directly (after env setup)
|
|
28
|
+
pytest tests/
|
|
29
|
+
pytest tests/agents_tests/ # Run agent-specific tests
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Code Quality
|
|
33
|
+
```bash
|
|
34
|
+
# Run all checks (style + typing)
|
|
35
|
+
hatch run all
|
|
36
|
+
|
|
37
|
+
# Style checks only
|
|
38
|
+
hatch run style
|
|
39
|
+
|
|
40
|
+
# Format code (fix style issues)
|
|
41
|
+
hatch run fmt
|
|
42
|
+
|
|
43
|
+
# Type checking
|
|
44
|
+
hatch run typing
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Dependency Management
|
|
48
|
+
```bash
|
|
49
|
+
# Sync dependencies
|
|
50
|
+
hatch run sync
|
|
51
|
+
|
|
52
|
+
# Update lock file
|
|
53
|
+
hatch run lock
|
|
54
|
+
|
|
55
|
+
# Upgrade all dependencies
|
|
56
|
+
hatch run lock-upgrade
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### AMSDAL CLI Commands
|
|
60
|
+
```bash
|
|
61
|
+
# Generate new model
|
|
62
|
+
amsdal generate model ModelName --format py
|
|
63
|
+
|
|
64
|
+
# Generate property for model
|
|
65
|
+
amsdal generate property --model ModelName property_name
|
|
66
|
+
|
|
67
|
+
# Generate transaction
|
|
68
|
+
amsdal generate transaction TransactionName
|
|
69
|
+
|
|
70
|
+
# Generate hook
|
|
71
|
+
amsdal generate hook --model ModelName on_create
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Architecture
|
|
75
|
+
|
|
76
|
+
### Core Components
|
|
77
|
+
|
|
78
|
+
**ML Models** (`amsdal_ml/ml_models/`)
|
|
79
|
+
- Abstract base class `MLModel` defines the interface for all ML models
|
|
80
|
+
- Supports both sync/async invoke and streaming methods
|
|
81
|
+
- Primary implementation uses OpenAI API
|
|
82
|
+
- All models must implement `setup()`, `teardown()`, `invoke()`, `ainvoke()`, `stream()`, and `astream()`
|
|
83
|
+
- Custom error hierarchy: `ModelError`, `ModelConnectionError`, `ModelRateLimitError`, `ModelAPIError`
|
|
84
|
+
|
|
85
|
+
**ML Ingesting** (`amsdal_ml/ml_ingesting/`)
|
|
86
|
+
- `MLIngesting` abstract base class handles text generation and embedding creation from data
|
|
87
|
+
- Creates `EmbeddingData` records that link embeddings to source objects
|
|
88
|
+
- Supports chunk-based processing with configurable depth and token limits
|
|
89
|
+
- Both sync/async methods for text generation and embedding
|
|
90
|
+
|
|
91
|
+
**ML Retrievers** (`amsdal_ml/ml_retrievers/`)
|
|
92
|
+
- `MLRetriever` provides semantic search via similarity_search/asimilarity_search
|
|
93
|
+
- Returns `RetrievalChunk` objects with object metadata, chunk text, distance, and tags
|
|
94
|
+
- Supports filtering by include/exclude tags
|
|
95
|
+
- Configurable k parameter for number of results
|
|
96
|
+
|
|
97
|
+
**Agents** (`amsdal_ml/agents/`)
|
|
98
|
+
- Abstract `Agent` base class for Q&A and task-oriented agents
|
|
99
|
+
- Async-first design (sync methods raise NotImplementedError)
|
|
100
|
+
- Returns `AgentOutput` with answer, used_tools, and citations
|
|
101
|
+
- Supports streaming responses via `astream()`
|
|
102
|
+
- File attachments supported through `FileAttachment` interface
|
|
103
|
+
|
|
104
|
+
**MCP Integration**
|
|
105
|
+
- **Server** (`amsdal_ml/mcp_server/`): Exposes retriever search as MCP tool via stdio
|
|
106
|
+
- **Client** (`amsdal_ml/mcp_client/`): Supports both stdio and HTTP transports for calling MCP tools
|
|
107
|
+
- Server accepts base64-encoded AMSDAL config for initialization
|
|
108
|
+
|
|
109
|
+
**File I/O** (`amsdal_ml/fileio/`)
|
|
110
|
+
- `BaseFileLoader` abstract class for uploading files to ML providers
|
|
111
|
+
- `FileAttachment` represents processed attachments (types: PLAIN_TEXT, FILE_ID)
|
|
112
|
+
- `FileItem` helper for creating attachments from paths, bytes, or strings
|
|
113
|
+
|
|
114
|
+
### Data Models
|
|
115
|
+
|
|
116
|
+
**EmbeddingModel** (`amsdal_ml/models/embedding_model.py`)
|
|
117
|
+
- Core model storing embeddings in database
|
|
118
|
+
- Links to source object via `data_object_class` and `data_object_id`
|
|
119
|
+
- Stores 1536-dimensional vectors (OpenAI text-embedding-3-small default)
|
|
120
|
+
- Includes chunk_index, raw_text, tags, and ml_metadata fields
|
|
121
|
+
|
|
122
|
+
### Configuration
|
|
123
|
+
|
|
124
|
+
**MLConfig** (`amsdal_ml/ml_config.py`)
|
|
125
|
+
- Loaded from `.env` file using pydantic-settings
|
|
126
|
+
- Key settings:
|
|
127
|
+
- `ml_model_class`: Path to ML model implementation
|
|
128
|
+
- `ml_retriever_class`: Path to retriever implementation
|
|
129
|
+
- `ml_ingesting_class`: Path to ingesting implementation
|
|
130
|
+
- `llm_model_name`: Default 'gpt-4o'
|
|
131
|
+
- `embed_model_name`: Default 'text-embedding-3-small'
|
|
132
|
+
- `embed_max_depth`, `embed_max_chunks`, `embed_max_tokens_per_chunk`: Chunking parameters
|
|
133
|
+
- `retriever_default_k`: Number of results for similarity search
|
|
134
|
+
- `openai_api_key`, `claude_api_key`: API credentials
|
|
135
|
+
- `embedding_targets`: List of models to embed
|
|
136
|
+
|
|
137
|
+
**Database Config** (`config.yml`)
|
|
138
|
+
- Defines AMSDAL connections (sqlite_history, sqlite_state, lock)
|
|
139
|
+
- Resources config maps repository and lakehouse to connections
|
|
140
|
+
- Set `async_mode: true` for async operations
|
|
141
|
+
|
|
142
|
+
## Code Style
|
|
143
|
+
|
|
144
|
+
- Python 3.11+ required
|
|
145
|
+
- Uses Ruff for linting and formatting with 120-char line length
|
|
146
|
+
- Single quotes enforced (`quote-style = "single"`)
|
|
147
|
+
- Import ordering: force-single-line with order-by-type
|
|
148
|
+
- Type checking via mypy with strict settings (disallow_any_generics, check_untyped_defs)
|
|
149
|
+
- Excludes migrations directory from linting
|
|
150
|
+
|
|
151
|
+
## Testing
|
|
152
|
+
|
|
153
|
+
- Uses pytest with pytest-asyncio for async tests
|
|
154
|
+
- Test fixtures in `tests/conftest.py` provide mocked OpenAI clients
|
|
155
|
+
- `OPENAI_API_KEY` set to dummy value in tests via fixture
|
|
156
|
+
- Coverage tracking with coverage.py
|
|
157
|
+
|
|
158
|
+
## CI/CD
|
|
159
|
+
|
|
160
|
+
The project uses self-hosted runners with two jobs:
|
|
161
|
+
1. **license-check**: Validates third-party licenses using `license_check.py`
|
|
162
|
+
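2. **test-lint**: Runs on Python 3.11 and 3.12 against both the `sqlite` and `postgres` database backends; executes `hatch run all` (style+typing, on the `sqlite` leg only) and `hatch run cov tests/ -- --database_backend=<backend>`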
|
|
163
|
+
|
|
164
|
+
## Key Patterns
|
|
165
|
+
|
|
166
|
+
1. **Async-First**: Most components prioritize async methods; sync methods often raise NotImplementedError
|
|
167
|
+
2. **Abstract Base Classes**: Heavy use of ABCs to define interfaces for models, retrievers, ingesters, and agents
|
|
168
|
+
3. **Configuration via Pydantic**: Settings loaded from environment with type validation
|
|
169
|
+
4. **AMSDAL Integration**: Uses AMSDAL's model system, manager, and connection framework
|
|
170
|
+
5. **Chunking Strategy**: Text split into chunks with metadata preservation for better embedding quality
|
|
171
|
+
6. **Tag-Based Filtering**: Embeddings tagged for fine-grained retrieval control
|
amsdal_ml-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: amsdal_ml
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: amsdal_ml plugin for AMSDAL Framework
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: aiohttp==3.12.15
|
|
7
|
+
Requires-Dist: amsdal-cli>=0.5.7
|
|
8
|
+
Requires-Dist: amsdal-data>=0.5.9
|
|
9
|
+
Requires-Dist: amsdal-models>=0.5.9
|
|
10
|
+
Requires-Dist: amsdal-utils>=0.5.4
|
|
11
|
+
Requires-Dist: amsdal>=0.5.6
|
|
12
|
+
Requires-Dist: mcp>=0.1
|
|
13
|
+
Requires-Dist: openai==1.100.2
|
|
14
|
+
Requires-Dist: pydantic-settings==2.10.1
|
|
15
|
+
Requires-Dist: pydantic==2.11.7
|
|
16
|
+
Requires-Dist: pymupdf>=1.24.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# AMSDAL ML
|
|
20
|
+
|
|
21
|
+
[![CI](https://github.com/amsdal/amsdal_ml/actions/workflows/ci.yml/badge.svg)](https://github.com/amsdal/amsdal_ml/actions/workflows/ci.yml)
|
|
22
|
+
[![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue)](https://www.python.org/downloads/)
|
|
23
|
+
|
|
24
|
+
Machine learning plugin for the AMSDAL Framework, providing embeddings, vector search, semantic retrieval, and AI agents with support for OpenAI models.
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- **Vector Embeddings**: Generate and store embeddings for any AMSDAL model with automatic chunking
|
|
29
|
+
- **Semantic Search**: Query your data using natural language with tag-based filtering
|
|
30
|
+
- **AI Agents**: Build Q&A systems with streaming support and citation tracking
|
|
31
|
+
- **Async-First**: Optimized for high-performance async operations
|
|
32
|
+
- **MCP Integration**: Expose and consume tools via Model Context Protocol (stdio/HTTP)
|
|
33
|
+
- **File Attachments**: Process and embed documents with built-in loaders
|
|
34
|
+
- **Extensible**: Abstract base classes for custom models, retrievers, and ingesters
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install amsdal-ml
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Requirements
|
|
43
|
+
|
|
44
|
+
- Python 3.11 or higher
|
|
45
|
+
- AMSDAL Framework 0.5.6+
|
|
46
|
+
- OpenAI API key (for default implementations)
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
### 1. Configuration
|
|
51
|
+
|
|
52
|
+
Create a `.env` file in your project root:
|
|
53
|
+
|
|
54
|
+
```env
|
|
55
|
+
OPENAI_API_KEY=sk-your-api-key-here
|
|
56
|
+
async_mode=true
|
|
57
|
+
ml_model_class=amsdal_ml.ml_models.openai_model.OpenAIModel
|
|
58
|
+
ml_retriever_class=amsdal_ml.ml_retrievers.openai_retriever.OpenAIRetriever
|
|
59
|
+
ml_ingesting_class=amsdal_ml.ml_ingesting.openai_ingesting.OpenAIIngesting
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Create a `config.yml` for AMSDAL connections:
|
|
63
|
+
|
|
64
|
+
```yaml
|
|
65
|
+
application_name: my-ml-app
|
|
66
|
+
async_mode: true
|
|
67
|
+
connections:
|
|
68
|
+
- name: sqlite_state
|
|
69
|
+
backend: sqlite-state-async
|
|
70
|
+
credentials:
|
|
71
|
+
- db_path: ./warehouse/state.sqlite3
|
|
72
|
+
- check_same_thread: false
|
|
73
|
+
- name: lock
|
|
74
|
+
backend: amsdal_data.lock.implementations.thread_lock.ThreadLock
|
|
75
|
+
resources_config:
|
|
76
|
+
repository:
|
|
77
|
+
default: sqlite_state
|
|
78
|
+
lock: lock
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 2. Generate Embeddings
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from amsdal_ml.ml_ingesting.openai_ingesting import OpenAIIngesting
|
|
85
|
+
from amsdal_ml.ml_config import ml_config
|
|
86
|
+
|
|
87
|
+
# Initialize ingesting
|
|
88
|
+
ingester = OpenAIIngesting(
|
|
89
|
+
model=MyModel,
|
|
90
|
+
embedding_field='embedding',
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Generate embeddings for an instance
|
|
94
|
+
instance = MyModel(content='Your text here')
|
|
95
|
+
embeddings = await ingester.agenerate_embeddings(instance)
|
|
96
|
+
await ingester.asave(embeddings, instance)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 3. Semantic Search
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from amsdal_ml.ml_retrievers.openai_retriever import OpenAIRetriever
|
|
103
|
+
|
|
104
|
+
retriever = OpenAIRetriever()
|
|
105
|
+
|
|
106
|
+
# Search for relevant content
|
|
107
|
+
results = await retriever.asimilarity_search(
|
|
108
|
+
query='What is machine learning?',
|
|
109
|
+
k=5,
|
|
110
|
+
include_tags=['documentation']
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
for chunk in results:
|
|
114
|
+
print(f'{chunk.object_class}:{chunk.object_id} - {chunk.raw_text}')
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### 4. Build an AI Agent
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from amsdal_ml.agents.default_qa_agent import DefaultQAAgent
|
|
121
|
+
|
|
122
|
+
agent = DefaultQAAgent()
|
|
123
|
+
|
|
124
|
+
# Ask questions
|
|
125
|
+
output = await agent.arun('Explain vector embeddings')
|
|
126
|
+
print(output.answer)
|
|
127
|
+
print(f'Used tools: {output.used_tools}')
|
|
128
|
+
|
|
129
|
+
# Stream responses
|
|
130
|
+
async for chunk in agent.astream('What is semantic search?'):
|
|
131
|
+
print(chunk, end='', flush=True)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### 5. Functional Calling Agent with Python Tools
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from amsdal_ml.agents.functional_calling_agent import FunctionalCallingAgent
|
|
138
|
+
from amsdal_ml.agents.python_tool import PythonTool
|
|
139
|
+
from amsdal_ml.ml_models.openai_model import OpenAIModel
|
|
140
|
+
|
|
141
|
+
llm = OpenAIModel()
|
|
142
|
+
agent = FunctionalCallingAgent(model=llm, tools=[search_tool, render_tool])
|
|
143
|
+
result = await agent.arun(user_query="Find products with price > 100", history=[])
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### 6. Natural Language Query Retriever
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from amsdal_ml.ml_retrievers.query_retriever import NLQueryRetriever
|
|
150
|
+
|
|
151
|
+
retriever = NLQueryRetriever(llm=llm, queryset=Product.objects.all())
|
|
152
|
+
documents = await retriever.invoke("Show me red products", limit=10)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 7. Document Ingestion Pipeline
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from amsdal_ml.ml_ingesting import ModelIngester
|
|
159
|
+
from amsdal_ml.ml_ingesting.pipeline import DefaultIngestionPipeline
|
|
160
|
+
from amsdal_ml.ml_ingesting.loaders.pdf_loader import PdfLoader
|
|
161
|
+
from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
|
|
162
|
+
from amsdal_ml.ml_ingesting.splitters.token_splitter import TokenSplitter
|
|
163
|
+
from amsdal_ml.ml_ingesting.embedders.openai_embedder import OpenAIEmbedder
|
|
164
|
+
from amsdal_ml.ml_ingesting.stores.embedding_data import EmbeddingDataStore
|
|
165
|
+
|
|
166
|
+
pipeline = DefaultIngestionPipeline(
|
|
167
|
+
loader=PdfLoader(), # Uses pymupdf for PDF processing
|
|
168
|
+
cleaner=TextCleaner(),
|
|
169
|
+
splitter=TokenSplitter(max_tokens=800, overlap_tokens=80),
|
|
170
|
+
embedder=OpenAIEmbedder(),
|
|
171
|
+
store=EmbeddingDataStore(),
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
ingester = ModelIngester(
|
|
175
|
+
pipeline=pipeline,
|
|
176
|
+
base_tags=["document"],
|
|
177
|
+
base_metadata={"source": "pdf"},
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Architecture
|
|
182
|
+
|
|
183
|
+
### Core Components
|
|
184
|
+
|
|
185
|
+
- **`MLModel`**: Abstract interface for LLM inference (invoke, stream, with attachments)
|
|
186
|
+
- **`MLIngesting`**: Generate text and embeddings from data objects with chunking
|
|
187
|
+
- **`MLRetriever`**: Semantic similarity search with tag-based filtering
|
|
188
|
+
- **`Agent`**: Q&A and task-oriented agents with streaming and citations
|
|
189
|
+
- **`EmbeddingModel`**: Database model storing 1536-dimensional vectors linked to source objects
|
|
190
|
+
- **`PythonTool`**: Tool for executing Python functions within agents
|
|
191
|
+
- **`FunctionalCallingAgent`**: Agent specialized in functional calling with configurable tools
|
|
192
|
+
- **`NLQueryRetriever`**: Retriever for natural language queries on AMSDAL querysets
|
|
193
|
+
- **`DefaultIngestionPipeline`**: Pipeline for document ingestion including loader, cleaner, splitter, embedder, and store
|
|
194
|
+
- **`ModelIngester`**: High-level ingester for processing models with customizable pipelines and metadata
|
|
195
|
+
- **`PdfLoader`**: Document loader using pymupdf for PDF processing
|
|
196
|
+
- **`TextCleaner`**: Processor for cleaning and normalizing text
|
|
197
|
+
- **`TokenSplitter`**: Splitter for dividing text into chunks based on token count
|
|
198
|
+
- **`OpenAIEmbedder`**: Embedder for generating embeddings via OpenAI API
|
|
199
|
+
- **`EmbeddingDataStore`**: Store for saving embedding data linked to source objects
|
|
200
|
+
- **MCP Server/Client**: Expose retrievers as tools or consume external MCP services
|
|
201
|
+
|
|
202
|
+
### Configuration
|
|
203
|
+
|
|
204
|
+
All settings are managed via `MLConfig` in `.env`:
|
|
205
|
+
|
|
206
|
+
```env
|
|
207
|
+
# Model Configuration
|
|
208
|
+
llm_model_name=gpt-4o
|
|
209
|
+
llm_temperature=0.0
|
|
210
|
+
embed_model_name=text-embedding-3-small
|
|
211
|
+
|
|
212
|
+
# Chunking Parameters
|
|
213
|
+
embed_max_depth=2
|
|
214
|
+
embed_max_chunks=10
|
|
215
|
+
embed_max_tokens_per_chunk=800
|
|
216
|
+
|
|
217
|
+
# Retrieval Settings
|
|
218
|
+
retriever_default_k=8
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Development
|
|
222
|
+
|
|
223
|
+
### Setup
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
# Install dependencies
|
|
227
|
+
pip install --upgrade uv hatch==1.14.2
|
|
228
|
+
hatch env create
|
|
229
|
+
hatch run sync
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Testing
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
# Run all tests with coverage
|
|
236
|
+
hatch run cov
|
|
237
|
+
|
|
238
|
+
# Run specific tests
|
|
239
|
+
hatch run test tests/test_openai_model.py
|
|
240
|
+
|
|
241
|
+
# Verbose output
|
|
242
|
+
pytest tests/ -v
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Code Quality
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
# Run all checks (style + typing)
|
|
249
|
+
hatch run all
|
|
250
|
+
|
|
251
|
+
# Format code
|
|
252
|
+
hatch run fmt
|
|
253
|
+
|
|
254
|
+
# Type checking
|
|
255
|
+
hatch run typing
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### AMSDAL CLI
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
# Generate a new model
|
|
262
|
+
amsdal generate model MyModel --format py
|
|
263
|
+
|
|
264
|
+
# Generate property
|
|
265
|
+
amsdal generate property --model MyModel embedding_field
|
|
266
|
+
|
|
267
|
+
# Generate transaction
|
|
268
|
+
amsdal generate transaction ProcessEmbeddings
|
|
269
|
+
|
|
270
|
+
# Generate hook
|
|
271
|
+
amsdal generate hook --model MyModel on_create
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
## MCP Server
|
|
275
|
+
|
|
276
|
+
Run the retriever as an MCP server for integration with Claude Desktop or other MCP clients:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
python -m amsdal_ml.mcp_server.server_retriever_stdio \
|
|
280
|
+
--amsdal-config "$(echo '{"async_mode": true, ...}' | base64)"
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
The server exposes a `search` tool for semantic search in your knowledge base.
|
|
284
|
+
|
|
285
|
+
## License
|
|
286
|
+
|
|
287
|
+
See `amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md` for dependency licenses.
|
|
288
|
+
|
|
289
|
+
## Links
|
|
290
|
+
|
|
291
|
+
- [AMSDAL Framework](https://github.com/amsdal/amsdal)
|
|
292
|
+
- [Documentation](https://docs.amsdal.com)
|
|
293
|
+
- [Issue Tracker](https://github.com/amsdal/amsdal_ml/issues)
|