amsdal_ml 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.github/workflows/ci.yml +18 -3
  2. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.gitignore +2 -0
  3. amsdal_ml-0.2.0/CLAUDE.md +171 -0
  4. amsdal_ml-0.2.0/PKG-INFO +293 -0
  5. amsdal_ml-0.2.0/README.md +275 -0
  6. amsdal_ml-0.2.0/RELEASE.md +180 -0
  7. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md +617 -0
  8. amsdal_ml-0.2.0/amsdal_ml/__about__.py +1 -0
  9. amsdal_ml-0.2.0/amsdal_ml/agents/__init__.py +13 -0
  10. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/agents/agent.py +5 -7
  11. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/agents/default_qa_agent.py +108 -143
  12. amsdal_ml-0.2.0/amsdal_ml/agents/functional_calling_agent.py +233 -0
  13. amsdal_ml-0.2.0/amsdal_ml/agents/mcp_client_tool.py +46 -0
  14. amsdal_ml-0.2.0/amsdal_ml/agents/python_tool.py +86 -0
  15. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/agents/retriever_tool.py +17 -8
  16. amsdal_ml-0.2.0/amsdal_ml/agents/tool_adapters.py +98 -0
  17. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/fileio/base_loader.py +7 -5
  18. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/fileio/openai_loader.py +16 -17
  19. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_client/base.py +2 -0
  20. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_client/http_client.py +7 -1
  21. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_client/stdio_client.py +21 -18
  22. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/mcp_server/server_retriever_stdio.py +8 -11
  23. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/__init__.py +29 -0
  24. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/default_ingesting.py +49 -51
  25. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/embedders/__init__.py +4 -0
  26. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/embedders/embedder.py +12 -0
  27. amsdal_ml-0.1.3/amsdal_ml/ml_retrievers/openai_retriever.py → amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/embedders/openai_embedder.py +6 -15
  28. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/embedding_data.py +3 -0
  29. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/__init__.py +6 -0
  30. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/folder_loader.py +52 -0
  31. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/loader.py +28 -0
  32. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/pdf_loader.py +136 -0
  33. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/loaders/text_loader.py +44 -0
  34. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/model_ingester.py +278 -0
  35. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/pipeline.py +131 -0
  36. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/pipeline_interface.py +31 -0
  37. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/processors/__init__.py +4 -0
  38. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/processors/cleaner.py +14 -0
  39. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/processors/text_cleaner.py +42 -0
  40. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/splitters/__init__.py +4 -0
  41. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/splitters/splitter.py +15 -0
  42. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/splitters/token_splitter.py +85 -0
  43. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/stores/__init__.py +4 -0
  44. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/stores/embedding_data.py +63 -0
  45. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/stores/store.py +22 -0
  46. amsdal_ml-0.2.0/amsdal_ml/ml_ingesting/types.py +40 -0
  47. amsdal_ml-0.2.0/amsdal_ml/ml_models/models.py +179 -0
  48. amsdal_ml-0.2.0/amsdal_ml/ml_models/openai_model.py +679 -0
  49. amsdal_ml-0.2.0/amsdal_ml/ml_models/utils.py +7 -0
  50. amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/__init__.py +17 -0
  51. amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/adapters.py +93 -0
  52. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_retrievers/default_retriever.py +11 -1
  53. amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/openai_retriever.py +59 -0
  54. amsdal_ml-0.2.0/amsdal_ml/ml_retrievers/query_retriever.py +487 -0
  55. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_retrievers/retriever.py +12 -0
  56. amsdal_ml-0.2.0/amsdal_ml/models/embedding_model.py +21 -0
  57. amsdal_ml-0.2.0/amsdal_ml/prompts/__init__.py +77 -0
  58. amsdal_ml-0.2.0/amsdal_ml/prompts/database_query_agent.prompt +14 -0
  59. amsdal_ml-0.2.0/amsdal_ml/prompts/functional_calling_agent_base.prompt +9 -0
  60. amsdal_ml-0.2.0/amsdal_ml/prompts/nl_query_filter.prompt +318 -0
  61. {amsdal_ml-0.1.3/amsdal_ml/agents/promts → amsdal_ml-0.2.0/amsdal_ml/prompts}/react_chat.prompt +17 -8
  62. amsdal_ml-0.2.0/amsdal_ml/utils/__init__.py +5 -0
  63. amsdal_ml-0.2.0/amsdal_ml/utils/query_utils.py +189 -0
  64. amsdal_ml-0.2.0/change-logs.md +49 -0
  65. amsdal_ml-0.2.0/docker-compose.tests.yml +16 -0
  66. amsdal_ml-0.2.0/latest-changelogs.md +14 -0
  67. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/pyproject.toml +7 -3
  68. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_arun.py +2 -2
  69. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_astream.py +7 -7
  70. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_astream_final_only.py +5 -5
  71. amsdal_ml-0.2.0/tests/agents_tests/test_fakes.py +272 -0
  72. amsdal_ml-0.2.0/tests/agents_tests/test_functional_calling_agent.py +241 -0
  73. amsdal_ml-0.2.0/tests/agents_tests/test_qa_agent_with_nlq_tool.py +268 -0
  74. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/agents_tests/test_tool_call_arguments_async.py +16 -6
  75. amsdal_ml-0.2.0/tests/conftest.py +220 -0
  76. amsdal_ml-0.2.0/tests/fixtures/models/author.py +15 -0
  77. amsdal_ml-0.2.0/tests/fixtures/models/book.py +16 -0
  78. amsdal_ml-0.2.0/tests/fixtures/models/category.py +16 -0
  79. amsdal_ml-0.2.0/tests/fixtures/models/order.py +18 -0
  80. amsdal_ml-0.2.0/tests/fixtures/models/product.py +18 -0
  81. amsdal_ml-0.2.0/tests/fixtures/models/user.py +18 -0
  82. amsdal_ml-0.2.0/tests/fixtures/models/vehicle.py +32 -0
  83. amsdal_ml-0.2.0/tests/ingesting/test_folder_loader_and_rag.py +120 -0
  84. amsdal_ml-0.2.0/tests/ingesting/test_ingestion_components.py +232 -0
  85. amsdal_ml-0.2.0/tests/ingesting/test_model_ingester.py +140 -0
  86. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/mock_tests/__init__.py +0 -0
  87. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/mock_tests/test_retriever_mock.py +651 -0
  88. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/models_tests/__init__.py +0 -0
  89. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/models_tests/test_nl_query_models.py +305 -0
  90. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/schema_tests/__init__.py +0 -0
  91. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/schema_tests/test_comprehensive_schema.py +348 -0
  92. amsdal_ml-0.2.0/tests/nlqueryretriever_tests/schema_tests/test_nested_list_skipping.py +135 -0
  93. amsdal_ml-0.2.0/tests/test_files/.gitkeep +0 -0
  94. amsdal_ml-0.2.0/tests/test_files/pdf/Aspida.pdf +0 -0
  95. amsdal_ml-0.2.0/tests/test_files/pdf/Nassau.pdf +0 -0
  96. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/tests/test_openai_model.py +64 -2
  97. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/uv.lock +1030 -768
  98. amsdal_ml-0.1.3/PKG-INFO +0 -69
  99. amsdal_ml-0.1.3/README.md +0 -52
  100. amsdal_ml-0.1.3/amsdal_ml/__about__.py +0 -1
  101. amsdal_ml-0.1.3/amsdal_ml/agents/promts/__init__.py +0 -58
  102. amsdal_ml-0.1.3/amsdal_ml/ml_models/models.py +0 -87
  103. amsdal_ml-0.1.3/amsdal_ml/ml_models/openai_model.py +0 -371
  104. amsdal_ml-0.1.3/amsdal_ml/models/embedding_model.py +0 -21
  105. amsdal_ml-0.1.3/change-logs.md +0 -24
  106. amsdal_ml-0.1.3/latest-changelogs.md +0 -6
  107. amsdal_ml-0.1.3/tests/agents_tests/test_fakes.py +0 -173
  108. amsdal_ml-0.1.3/tests/conftest.py +0 -105
  109. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal/.dependencies +0 -0
  110. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal/.environment +0 -0
  111. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal/.secrets +0 -0
  112. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.amsdal-cli +0 -0
  113. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.github/workflows/release.yml +0 -0
  114. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/.github/workflows/tag_check.yml +0 -0
  115. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/__init__.py +0 -0
  116. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/app.py +0 -0
  117. {amsdal_ml-0.1.3/amsdal_ml/agents → amsdal_ml-0.2.0/amsdal_ml/fileio}/__init__.py +0 -0
  118. {amsdal_ml-0.1.3/amsdal_ml/fileio → amsdal_ml-0.2.0/amsdal_ml/mcp_client}/__init__.py +0 -0
  119. {amsdal_ml-0.1.3/amsdal_ml/mcp_client → amsdal_ml-0.2.0/amsdal_ml/mcp_server}/__init__.py +0 -0
  120. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/migrations/0000_initial.py +0 -0
  121. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_config.py +0 -0
  122. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/ingesting.py +0 -0
  123. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/ml_ingesting/openai_ingesting.py +0 -0
  124. {amsdal_ml-0.1.3/amsdal_ml/mcp_server → amsdal_ml-0.2.0/amsdal_ml/ml_models}/__init__.py +0 -0
  125. {amsdal_ml-0.1.3/amsdal_ml/ml_ingesting → amsdal_ml-0.2.0/amsdal_ml/models}/__init__.py +0 -0
  126. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/amsdal_ml/py.typed +0 -0
  127. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/config.yml +0 -0
  128. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/license_check.py +0 -0
  129. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/scripts/release.sh +0 -0
  130. {amsdal_ml-0.1.3 → amsdal_ml-0.2.0}/scripts/tag_check.sh +0 -0
  131. {amsdal_ml-0.1.3/amsdal_ml/ml_models → amsdal_ml-0.2.0/tests}/__init__.py +0 -0
  132. {amsdal_ml-0.1.3/amsdal_ml/ml_retrievers → amsdal_ml-0.2.0/tests/agents_tests}/__init__.py +0 -0
  133. {amsdal_ml-0.1.3/amsdal_ml → amsdal_ml-0.2.0/tests/fixtures}/models/__init__.py +0 -0
  134. {amsdal_ml-0.1.3/tests → amsdal_ml-0.2.0/tests/ingesting}/__init__.py +0 -0
  135. {amsdal_ml-0.1.3/tests/agents_tests → amsdal_ml-0.2.0/tests/nlqueryretriever_tests}/__init__.py +0 -0
@@ -23,7 +23,7 @@ jobs:
23
23
  python license_check.py
24
24
 
25
25
  test-lint:
26
- name: Run tests and check style
26
+ name: Run tests and check style (Python ${{ matrix.python-version }}, ${{ matrix.database-backend }})
27
27
  needs: [license-check]
28
28
  runs-on: self-hosted
29
29
  strategy:
@@ -31,6 +31,21 @@ jobs:
31
31
  fail-fast: false
32
32
  matrix:
33
33
  python-version: ["3.11", "3.12"]
34
+ database-backend: ["sqlite", "postgres"]
35
+ services:
36
+ postgres:
37
+ image: pgvector/pgvector:pg16
38
+ env:
39
+ POSTGRES_USER: postgres
40
+ POSTGRES_PASSWORD: example
41
+ POSTGRES_DB: postgres
42
+ ports:
43
+ - 5432:5432
44
+ options: >-
45
+ --health-cmd pg_isready
46
+ --health-interval 10s
47
+ --health-timeout 5s
48
+ --health-retries 5
34
49
  env:
35
50
  PYTHON: ${{ matrix.python-version }}
36
51
  DEPS: yes
@@ -54,9 +69,9 @@ jobs:
54
69
  hatch run sync
55
70
 
56
71
  - name: Run style checks
57
- if: always()
72
+ if: always() && matrix.database-backend == 'sqlite'
58
73
  run: hatch run all
59
74
 
60
75
  - name: Run tests
61
76
  if: always()
62
- run: hatch run cov tests/
77
+ run: hatch run cov tests/ -- --database_backend=${{ matrix.database-backend }}
@@ -1,6 +1,7 @@
1
1
  .venv/
2
2
  venv/
3
3
  /warehouse
4
+ .python-version
4
5
 
5
6
  __pycache__/
6
7
  *.py[cod]
@@ -34,3 +35,4 @@ Thumbs.db
34
35
  /models/
35
36
  /fixtures/
36
37
  /static/
38
+ .tmp
@@ -0,0 +1,171 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ amsdal-ml is a machine learning plugin for the AMSDAL Framework that provides embeddings, vector search, and AI-driven features. It supports both synchronous and asynchronous modes, with a primary focus on async operations using OpenAI models.
8
+
9
+ ## Development Commands
10
+
11
+ ### Environment Setup
12
+ ```bash
13
+ # Install dependencies using hatch/uv
14
+ pip install --upgrade uv hatch==1.14.2
15
+ hatch env create
16
+ hatch run sync
17
+ ```
18
+
19
+ ### Testing
20
+ ```bash
21
+ # Run all tests with coverage
22
+ hatch run cov
23
+
24
+ # Run specific test file
25
+ hatch run test tests/test_openai_model.py
26
+
27
+ # Run tests with pytest directly (after env setup)
28
+ pytest tests/
29
+ pytest tests/agents_tests/ # Run agent-specific tests
30
+ ```
31
+
32
+ ### Code Quality
33
+ ```bash
34
+ # Run all checks (style + typing)
35
+ hatch run all
36
+
37
+ # Style checks only
38
+ hatch run style
39
+
40
+ # Format code (fix style issues)
41
+ hatch run fmt
42
+
43
+ # Type checking
44
+ hatch run typing
45
+ ```
46
+
47
+ ### Dependency Management
48
+ ```bash
49
+ # Sync dependencies
50
+ hatch run sync
51
+
52
+ # Update lock file
53
+ hatch run lock
54
+
55
+ # Upgrade all dependencies
56
+ hatch run lock-upgrade
57
+ ```
58
+
59
+ ### AMSDAL CLI Commands
60
+ ```bash
61
+ # Generate new model
62
+ amsdal generate model ModelName --format py
63
+
64
+ # Generate property for model
65
+ amsdal generate property --model ModelName property_name
66
+
67
+ # Generate transaction
68
+ amsdal generate transaction TransactionName
69
+
70
+ # Generate hook
71
+ amsdal generate hook --model ModelName on_create
72
+ ```
73
+
74
+ ## Architecture
75
+
76
+ ### Core Components
77
+
78
+ **ML Models** (`amsdal_ml/ml_models/`)
79
+ - Abstract base class `MLModel` defines the interface for all ML models
80
+ - Supports both sync/async invoke and streaming methods
81
+ - Primary implementation uses OpenAI API
82
+ - All models must implement `setup()`, `teardown()`, `invoke()`, `ainvoke()`, `stream()`, and `astream()`
83
+ - Custom error hierarchy: `ModelError`, `ModelConnectionError`, `ModelRateLimitError`, `ModelAPIError`
84
+
85
+ **ML Ingesting** (`amsdal_ml/ml_ingesting/`)
86
+ - `MLIngesting` abstract base class handles text generation and embedding creation from data
87
+ - Creates `EmbeddingData` records that link embeddings to source objects
88
+ - Supports chunk-based processing with configurable depth and token limits
89
+ - Both sync/async methods for text generation and embedding
90
+
91
+ **ML Retrievers** (`amsdal_ml/ml_retrievers/`)
92
+ - `MLRetriever` provides semantic search via similarity_search/asimilarity_search
93
+ - Returns `RetrievalChunk` objects with object metadata, chunk text, distance, and tags
94
+ - Supports filtering by include/exclude tags
95
+ - Configurable k parameter for number of results
96
+
97
+ **Agents** (`amsdal_ml/agents/`)
98
+ - Abstract `Agent` base class for Q&A and task-oriented agents
99
+ - Async-first design (sync methods raise NotImplementedError)
100
+ - Returns `AgentOutput` with answer, used_tools, and citations
101
+ - Supports streaming responses via `astream()`
102
+ - File attachments supported through `FileAttachment` interface
103
+
104
+ **MCP Integration**
105
+ - **Server** (`amsdal_ml/mcp_server/`): Exposes retriever search as MCP tool via stdio
106
+ - **Client** (`amsdal_ml/mcp_client/`): Supports both stdio and HTTP transports for calling MCP tools
107
+ - Server accepts base64-encoded AMSDAL config for initialization
108
+
109
+ **File I/O** (`amsdal_ml/fileio/`)
110
+ - `BaseFileLoader` abstract class for uploading files to ML providers
111
+ - `FileAttachment` represents processed attachments (types: PLAIN_TEXT, FILE_ID)
112
+ - `FileItem` helper for creating attachments from paths, bytes, or strings
113
+
114
+ ### Data Models
115
+
116
+ **EmbeddingModel** (`amsdal_ml/models/embedding_model.py`)
117
+ - Core model storing embeddings in database
118
+ - Links to source object via `data_object_class` and `data_object_id`
119
+ - Stores 1536-dimensional vectors (OpenAI text-embedding-3-small default)
120
+ - Includes chunk_index, raw_text, tags, and ml_metadata fields
121
+
122
+ ### Configuration
123
+
124
+ **MLConfig** (`amsdal_ml/ml_config.py`)
125
+ - Loaded from `.env` file using pydantic-settings
126
+ - Key settings:
127
+ - `ml_model_class`: Path to ML model implementation
128
+ - `ml_retriever_class`: Path to retriever implementation
129
+ - `ml_ingesting_class`: Path to ingesting implementation
130
+ - `llm_model_name`: Default 'gpt-4o'
131
+ - `embed_model_name`: Default 'text-embedding-3-small'
132
+ - `embed_max_depth`, `embed_max_chunks`, `embed_max_tokens_per_chunk`: Chunking parameters
133
+ - `retriever_default_k`: Number of results for similarity search
134
+ - `openai_api_key`, `claude_api_key`: API credentials
135
+ - `embedding_targets`: List of models to embed
136
+
137
+ **Database Config** (`config.yml`)
138
+ - Defines AMSDAL connections (sqlite_history, sqlite_state, lock)
139
+ - Resources config maps repository and lakehouse to connections
140
+ - Set `async_mode: true` for async operations
141
+
142
+ ## Code Style
143
+
144
+ - Python 3.11+ required
145
+ - Uses Ruff for linting and formatting with 120-char line length
146
+ - Single quotes enforced (`quote-style = "single"`)
147
+ - Import ordering: force-single-line with order-by-type
148
+ - Type checking via mypy with strict settings (disallow_any_generics, check_untyped_defs)
149
+ - Excludes migrations directory from linting
150
+
151
+ ## Testing
152
+
153
+ - Uses pytest with pytest-asyncio for async tests
154
+ - Test fixtures in `tests/conftest.py` provide mocked OpenAI clients
155
+ - `OPENAI_API_KEY` set to dummy value in tests via fixture
156
+ - Coverage tracking with coverage.py
157
+
158
+ ## CI/CD
159
+
160
+ The project uses self-hosted runners with two jobs:
161
+ 1. **license-check**: Validates third-party licenses using `license_check.py`
162
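+ 2. **test-lint**: Runs on Python 3.11 and 3.12 against both the `sqlite` and `postgres` database backends; executes `hatch run all` (style+typing, on the `sqlite` matrix leg only) and `hatch run cov tests/ -- --database_backend=<backend>` on every leg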
163
+
164
+ ## Key Patterns
165
+
166
+ 1. **Async-First**: Most components prioritize async methods; sync methods often raise NotImplementedError
167
+ 2. **Abstract Base Classes**: Heavy use of ABCs to define interfaces for models, retrievers, ingesters, and agents
168
+ 3. **Configuration via Pydantic**: Settings loaded from environment with type validation
169
+ 4. **AMSDAL Integration**: Uses AMSDAL's model system, manager, and connection framework
170
+ 5. **Chunking Strategy**: Text split into chunks with metadata preservation for better embedding quality
171
+ 6. **Tag-Based Filtering**: Embeddings tagged for fine-grained retrieval control
@@ -0,0 +1,293 @@
1
+ Metadata-Version: 2.4
2
+ Name: amsdal_ml
3
+ Version: 0.2.0
4
+ Summary: amsdal_ml plugin for AMSDAL Framework
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: aiohttp==3.12.15
7
+ Requires-Dist: amsdal-cli>=0.5.7
8
+ Requires-Dist: amsdal-data>=0.5.9
9
+ Requires-Dist: amsdal-models>=0.5.9
10
+ Requires-Dist: amsdal-utils>=0.5.4
11
+ Requires-Dist: amsdal>=0.5.6
12
+ Requires-Dist: mcp>=0.1
13
+ Requires-Dist: openai==1.100.2
14
+ Requires-Dist: pydantic-settings==2.10.1
15
+ Requires-Dist: pydantic==2.11.7
16
+ Requires-Dist: pymupdf>=1.24.10
17
+ Description-Content-Type: text/markdown
18
+
19
+ # AMSDAL ML
20
+
21
+ [![CI](https://github.com/amsdal/amsdal_ml/actions/workflows/ci.yml/badge.svg)](https://github.com/amsdal/amsdal_ml/actions/workflows/ci.yml)
22
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
23
+
24
+ Machine learning plugin for the AMSDAL Framework, providing embeddings, vector search, semantic retrieval, and AI agents with support for OpenAI models.
25
+
26
+ ## Features
27
+
28
+ - **Vector Embeddings**: Generate and store embeddings for any AMSDAL model with automatic chunking
29
+ - **Semantic Search**: Query your data using natural language with tag-based filtering
30
+ - **AI Agents**: Build Q&A systems with streaming support and citation tracking
31
+ - **Async-First**: Optimized for high-performance async operations
32
+ - **MCP Integration**: Expose and consume tools via Model Context Protocol (stdio/HTTP)
33
+ - **File Attachments**: Process and embed documents with built-in loaders
34
+ - **Extensible**: Abstract base classes for custom models, retrievers, and ingesters
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install amsdal-ml
40
+ ```
41
+
42
+ ### Requirements
43
+
44
+ - Python 3.11 or higher
45
+ - AMSDAL Framework 0.5.6+
46
+ - OpenAI API key (for default implementations)
47
+
48
+ ## Quick Start
49
+
50
+ ### 1. Configuration
51
+
52
+ Create a `.env` file in your project root:
53
+
54
+ ```env
55
+ OPENAI_API_KEY=sk-your-api-key-here
56
+ async_mode=true
57
+ ml_model_class=amsdal_ml.ml_models.openai_model.OpenAIModel
58
+ ml_retriever_class=amsdal_ml.ml_retrievers.openai_retriever.OpenAIRetriever
59
+ ml_ingesting_class=amsdal_ml.ml_ingesting.openai_ingesting.OpenAIIngesting
60
+ ```
61
+
62
+ Create a `config.yml` for AMSDAL connections:
63
+
64
+ ```yaml
65
+ application_name: my-ml-app
66
+ async_mode: true
67
+ connections:
68
+ - name: sqlite_state
69
+ backend: sqlite-state-async
70
+ credentials:
71
+ - db_path: ./warehouse/state.sqlite3
72
+ - check_same_thread: false
73
+ - name: lock
74
+ backend: amsdal_data.lock.implementations.thread_lock.ThreadLock
75
+ resources_config:
76
+ repository:
77
+ default: sqlite_state
78
+ lock: lock
79
+ ```
80
+
81
+ ### 2. Generate Embeddings
82
+
83
+ ```python
84
+ from amsdal_ml.ml_ingesting.openai_ingesting import OpenAIIngesting
85
+ from amsdal_ml.ml_config import ml_config
86
+
87
+ # Initialize the ingester
88
+ ingester = OpenAIIngesting(
89
+ model=MyModel,
90
+ embedding_field='embedding',
91
+ )
92
+
93
+ # Generate embeddings for an instance
94
+ instance = MyModel(content='Your text here')
95
+ embeddings = await ingester.agenerate_embeddings(instance)
96
+ await ingester.asave(embeddings, instance)
97
+ ```
98
+
99
+ ### 3. Semantic Search
100
+
101
+ ```python
102
+ from amsdal_ml.ml_retrievers.openai_retriever import OpenAIRetriever
103
+
104
+ retriever = OpenAIRetriever()
105
+
106
+ # Search for relevant content
107
+ results = await retriever.asimilarity_search(
108
+ query='What is machine learning?',
109
+ k=5,
110
+ include_tags=['documentation']
111
+ )
112
+
113
+ for chunk in results:
114
+ print(f'{chunk.object_class}:{chunk.object_id} - {chunk.raw_text}')
115
+ ```
116
+
117
+ ### 4. Build an AI Agent
118
+
119
+ ```python
120
+ from amsdal_ml.agents.default_qa_agent import DefaultQAAgent
121
+
122
+ agent = DefaultQAAgent()
123
+
124
+ # Ask questions
125
+ output = await agent.arun('Explain vector embeddings')
126
+ print(output.answer)
127
+ print(f'Used tools: {output.used_tools}')
128
+
129
+ # Stream responses
130
+ async for chunk in agent.astream('What is semantic search?'):
131
+ print(chunk, end='', flush=True)
132
+ ```
133
+
134
+ ### 5. Functional Calling Agent with Python Tools
135
+
136
+ ```python
137
+ from amsdal_ml.agents.functional_calling_agent import FunctionalCallingAgent
138
+ from amsdal_ml.agents.python_tool import PythonTool
139
+ from amsdal_ml.ml_models.openai_model import OpenAIModel
140
+
141
+ llm = OpenAIModel()
142
+ agent = FunctionalCallingAgent(model=llm, tools=[search_tool, render_tool])
143
+ result = await agent.arun(user_query="Find products with price > 100", history=[])
144
+ ```
145
+
146
+ ### 6. Natural Language Query Retriever
147
+
148
+ ```python
149
+ from amsdal_ml.ml_retrievers.query_retriever import NLQueryRetriever
150
+
151
+ retriever = NLQueryRetriever(llm=llm, queryset=Product.objects.all())
152
+ documents = await retriever.invoke("Show me red products", limit=10)
153
+ ```
154
+
155
+ ### 7. Document Ingestion Pipeline
156
+
157
+ ```python
158
+ from amsdal_ml.ml_ingesting import ModelIngester
159
+ from amsdal_ml.ml_ingesting.pipeline import DefaultIngestionPipeline
160
+ from amsdal_ml.ml_ingesting.loaders.pdf_loader import PdfLoader
161
+ from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
162
+ from amsdal_ml.ml_ingesting.splitters.token_splitter import TokenSplitter
163
+ from amsdal_ml.ml_ingesting.embedders.openai_embedder import OpenAIEmbedder
164
+ from amsdal_ml.ml_ingesting.stores.embedding_data import EmbeddingDataStore
165
+
166
+ pipeline = DefaultIngestionPipeline(
167
+ loader=PdfLoader(), # Uses pymupdf for PDF processing
168
+ cleaner=TextCleaner(),
169
+ splitter=TokenSplitter(max_tokens=800, overlap_tokens=80),
170
+ embedder=OpenAIEmbedder(),
171
+ store=EmbeddingDataStore(),
172
+ )
173
+
174
+ ingester = ModelIngester(
175
+ pipeline=pipeline,
176
+ base_tags=["document"],
177
+ base_metadata={"source": "pdf"},
178
+ )
179
+ ```
180
+
181
+ ## Architecture
182
+
183
+ ### Core Components
184
+
185
+ - **`MLModel`**: Abstract interface for LLM inference (invoke, stream, with attachments)
186
+ - **`MLIngesting`**: Generate text and embeddings from data objects with chunking
187
+ - **`MLRetriever`**: Semantic similarity search with tag-based filtering
188
+ - **`Agent`**: Q&A and task-oriented agents with streaming and citations
189
+ - **`EmbeddingModel`**: Database model storing 1536-dimensional vectors linked to source objects
190
+ - **`PythonTool`**: Tool for executing Python functions within agents
191
+ - **`FunctionalCallingAgent`**: Agent specialized in functional calling with configurable tools
192
+ - **`NLQueryRetriever`**: Retriever for natural language queries on AMSDAL querysets
193
+ - **`DefaultIngestionPipeline`**: Pipeline for document ingestion including loader, cleaner, splitter, embedder, and store
194
+ - **`ModelIngester`**: High-level ingester for processing models with customizable pipelines and metadata
195
+ - **`PdfLoader`**: Document loader using pymupdf for PDF processing
196
+ - **`TextCleaner`**: Processor for cleaning and normalizing text
197
+ - **`TokenSplitter`**: Splitter for dividing text into chunks based on token count
198
+ - **`OpenAIEmbedder`**: Embedder for generating embeddings via OpenAI API
199
+ - **`EmbeddingDataStore`**: Store for saving embedding data linked to source objects
200
+ - **MCP Server/Client**: Expose retrievers as tools or consume external MCP services
201
+
202
+ ### Configuration
203
+
204
+ All settings are managed via `MLConfig`, which loads them from the `.env` file:
205
+
206
+ ```env
207
+ # Model Configuration
208
+ llm_model_name=gpt-4o
209
+ llm_temperature=0.0
210
+ embed_model_name=text-embedding-3-small
211
+
212
+ # Chunking Parameters
213
+ embed_max_depth=2
214
+ embed_max_chunks=10
215
+ embed_max_tokens_per_chunk=800
216
+
217
+ # Retrieval Settings
218
+ retriever_default_k=8
219
+ ```
220
+
221
+ ## Development
222
+
223
+ ### Setup
224
+
225
+ ```bash
226
+ # Install dependencies
227
+ pip install --upgrade uv hatch==1.14.2
228
+ hatch env create
229
+ hatch run sync
230
+ ```
231
+
232
+ ### Testing
233
+
234
+ ```bash
235
+ # Run all tests with coverage
236
+ hatch run cov
237
+
238
+ # Run specific tests
239
+ hatch run test tests/test_openai_model.py
240
+
241
+ # Verbose output (run pytest directly)
242
+ pytest tests/ -v
243
+ ```
244
+
245
+ ### Code Quality
246
+
247
+ ```bash
248
+ # Run all checks (style + typing)
249
+ hatch run all
250
+
251
+ # Format code
252
+ hatch run fmt
253
+
254
+ # Type checking
255
+ hatch run typing
256
+ ```
257
+
258
+ ### AMSDAL CLI
259
+
260
+ ```bash
261
+ # Generate a new model
262
+ amsdal generate model MyModel --format py
263
+
264
+ # Generate property
265
+ amsdal generate property --model MyModel embedding_field
266
+
267
+ # Generate transaction
268
+ amsdal generate transaction ProcessEmbeddings
269
+
270
+ # Generate hook
271
+ amsdal generate hook --model MyModel on_create
272
+ ```
273
+
274
+ ## MCP Server
275
+
276
+ Run the retriever as an MCP server for integration with Claude Desktop or other MCP clients:
277
+
278
+ ```bash
279
+ python -m amsdal_ml.mcp_server.server_retriever_stdio \
280
+ --amsdal-config "$(echo '{"async_mode": true, ...}' | base64)"
281
+ ```
282
+
283
+ The server exposes a `search` tool for semantic search in your knowledge base.
284
+
285
+ ## License
286
+
287
+ See `amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md` for dependency licenses.
288
+
289
+ ## Links
290
+
291
+ - [AMSDAL Framework](https://github.com/amsdal/amsdal)
292
+ - [Documentation](https://docs.amsdal.com)
293
+ - [Issue Tracker](https://github.com/amsdal/amsdal_ml/issues)