banks 2.3.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. banks-2.4.0/.github/workflows/release.yml +79 -0
  2. banks-2.3.0/CLAUDE.md → banks-2.4.0/AGENTS.md +114 -24
  3. banks-2.4.0/CLAUDE.md +3 -0
  4. {banks-2.3.0 → banks-2.4.0}/PKG-INFO +2 -1
  5. {banks-2.3.0 → banks-2.4.0}/pyproject.toml +3 -0
  6. {banks-2.3.0 → banks-2.4.0}/src/banks/__about__.py +1 -1
  7. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/audio.py +19 -3
  8. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/document.py +45 -6
  9. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/image.py +4 -2
  10. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/video.py +49 -3
  11. {banks-2.3.0 → banks-2.4.0}/src/banks/types.py +103 -3
  12. {banks-2.3.0 → banks-2.4.0}/tests/test_audio.py +49 -1
  13. banks-2.4.0/tests/test_document.py +167 -0
  14. {banks-2.3.0 → banks-2.4.0}/tests/test_image.py +35 -0
  15. {banks-2.3.0 → banks-2.4.0}/tests/test_video.py +54 -1
  16. banks-2.3.0/.github/workflows/release.yml +0 -33
  17. banks-2.3.0/tests/test_document.py +0 -74
  18. {banks-2.3.0 → banks-2.4.0}/.github/workflows/docs.yml +0 -0
  19. {banks-2.3.0 → banks-2.4.0}/.github/workflows/test.yml +0 -0
  20. {banks-2.3.0 → banks-2.4.0}/.gitignore +0 -0
  21. {banks-2.3.0 → banks-2.4.0}/CITATION.cff +0 -0
  22. {banks-2.3.0 → banks-2.4.0}/CODE_OF_CONDUCT.md +0 -0
  23. {banks-2.3.0 → banks-2.4.0}/CONTRIBUTING.md +0 -0
  24. {banks-2.3.0 → banks-2.4.0}/LICENSE.txt +0 -0
  25. {banks-2.3.0 → banks-2.4.0}/MANIFEST.in +0 -0
  26. {banks-2.3.0 → banks-2.4.0}/README.md +0 -0
  27. {banks-2.3.0 → banks-2.4.0}/assets/banks.png +0 -0
  28. {banks-2.3.0 → banks-2.4.0}/cookbook/Prompt_Caching_with_Anthropic.ipynb +0 -0
  29. {banks-2.3.0 → banks-2.4.0}/cookbook/Prompt_Versioning.ipynb +0 -0
  30. {banks-2.3.0 → banks-2.4.0}/cookbook/in_prompt_completion.ipynb +0 -0
  31. {banks-2.3.0 → banks-2.4.0}/docs/config.md +0 -0
  32. {banks-2.3.0 → banks-2.4.0}/docs/examples.md +0 -0
  33. {banks-2.3.0 → banks-2.4.0}/docs/index.md +0 -0
  34. {banks-2.3.0 → banks-2.4.0}/docs/prompt.md +0 -0
  35. {banks-2.3.0 → banks-2.4.0}/docs/python.md +0 -0
  36. {banks-2.3.0 → banks-2.4.0}/docs/registry.md +0 -0
  37. {banks-2.3.0 → banks-2.4.0}/mkdocs.yml +0 -0
  38. {banks-2.3.0 → banks-2.4.0}/src/banks/__init__.py +0 -0
  39. {banks-2.3.0 → banks-2.4.0}/src/banks/cache.py +0 -0
  40. {banks-2.3.0 → banks-2.4.0}/src/banks/config.py +0 -0
  41. {banks-2.3.0 → banks-2.4.0}/src/banks/env.py +0 -0
  42. {banks-2.3.0 → banks-2.4.0}/src/banks/errors.py +0 -0
  43. {banks-2.3.0 → banks-2.4.0}/src/banks/extensions/__init__.py +0 -0
  44. {banks-2.3.0 → banks-2.4.0}/src/banks/extensions/chat.py +0 -0
  45. {banks-2.3.0 → banks-2.4.0}/src/banks/extensions/completion.py +0 -0
  46. {banks-2.3.0 → banks-2.4.0}/src/banks/extensions/docs.py +0 -0
  47. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/__init__.py +0 -0
  48. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/cache_control.py +0 -0
  49. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/lemmatize.py +0 -0
  50. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/tool.py +0 -0
  51. {banks-2.3.0 → banks-2.4.0}/src/banks/filters/xml.py +0 -0
  52. {banks-2.3.0 → banks-2.4.0}/src/banks/prompt.py +0 -0
  53. {banks-2.3.0 → banks-2.4.0}/src/banks/registries/__init__.py +0 -0
  54. {banks-2.3.0 → banks-2.4.0}/src/banks/registries/directory.py +0 -0
  55. {banks-2.3.0 → banks-2.4.0}/src/banks/registries/file.py +0 -0
  56. {banks-2.3.0 → banks-2.4.0}/src/banks/registries/redis.py +0 -0
  57. {banks-2.3.0 → banks-2.4.0}/src/banks/utils.py +0 -0
  58. {banks-2.3.0 → banks-2.4.0}/tests/__init__.py +0 -0
  59. {banks-2.3.0 → banks-2.4.0}/tests/conftest.py +0 -0
  60. {banks-2.3.0 → banks-2.4.0}/tests/data/1x1.pdf +0 -0
  61. {banks-2.3.0 → banks-2.4.0}/tests/data/1x1.png +0 -0
  62. {banks-2.3.0 → banks-2.4.0}/tests/data/empty.mov +0 -0
  63. {banks-2.3.0 → banks-2.4.0}/tests/data/empty.wav +0 -0
  64. {banks-2.3.0 → banks-2.4.0}/tests/e2e/__init__.py +0 -0
  65. {banks-2.3.0 → banks-2.4.0}/tests/e2e/conftest.py +0 -0
  66. {banks-2.3.0 → banks-2.4.0}/tests/e2e/test_completion.py +0 -0
  67. {banks-2.3.0 → banks-2.4.0}/tests/e2e/test_function_calling.py +0 -0
  68. {banks-2.3.0 → banks-2.4.0}/tests/templates/blog.jinja +0 -0
  69. {banks-2.3.0 → banks-2.4.0}/tests/templates/cache.jinja +0 -0
  70. {banks-2.3.0 → banks-2.4.0}/tests/templates/chat.jinja +0 -0
  71. {banks-2.3.0 → banks-2.4.0}/tests/templates/summarize.jinja +0 -0
  72. {banks-2.3.0 → banks-2.4.0}/tests/templates/summarize_lemma.jinja +0 -0
  73. {banks-2.3.0 → banks-2.4.0}/tests/test_cache.py +0 -0
  74. {banks-2.3.0 → banks-2.4.0}/tests/test_cache_control.py +0 -0
  75. {banks-2.3.0 → banks-2.4.0}/tests/test_chat.py +0 -0
  76. {banks-2.3.0 → banks-2.4.0}/tests/test_completion.py +0 -0
  77. {banks-2.3.0 → banks-2.4.0}/tests/test_config.py +0 -0
  78. {banks-2.3.0 → banks-2.4.0}/tests/test_directory_registry.py +0 -0
  79. {banks-2.3.0 → banks-2.4.0}/tests/test_file_registry.py +0 -0
  80. {banks-2.3.0 → banks-2.4.0}/tests/test_prompt.py +0 -0
  81. {banks-2.3.0 → banks-2.4.0}/tests/test_redis_registry.py +0 -0
  82. {banks-2.3.0 → banks-2.4.0}/tests/test_tool.py +0 -0
  83. {banks-2.3.0 → banks-2.4.0}/tests/test_types.py +0 -0
  84. {banks-2.3.0 → banks-2.4.0}/tests/test_utils.py +0 -0
  85. {banks-2.3.0 → banks-2.4.0}/tests/test_xml.py +0 -0
@@ -0,0 +1,79 @@
1
+ name: PyPI Release
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ bump:
7
+ description: "Version bump type"
8
+ required: true
9
+ type: choice
10
+ options:
11
+ - MINOR
12
+ - BUGFIX
13
+ default: BUGFIX
14
+
15
+ jobs:
16
+ release:
17
+ runs-on: ubuntu-latest
18
+ permissions:
19
+ contents: write
20
+
21
+ steps:
22
+ - name: Checkout
23
+ uses: actions/checkout@v4
24
+ with:
25
+ ref: ${{ github.event.repository.default_branch }}
26
+
27
+ - name: Bump version
28
+ run: |
29
+ CURRENT=$(sed -n 's/^__version__ = "\([0-9.]*\).*/\1/p' src/banks/__about__.py)
30
+ IFS=. read -r major minor patch <<< "$CURRENT"
31
+ case "${{ github.event.inputs.bump }}" in
32
+ BUGFIX)
33
+ patch=$((patch + 1))
34
+ ;;
35
+ MINOR)
36
+ minor=$((minor + 1))
37
+ patch=0
38
+ ;;
39
+ *)
40
+ echo "Unexpected bump type: ${{ github.event.inputs.bump }}"
41
+ exit 1
42
+ ;;
43
+ esac
44
+ VERSION="${major}.${minor}.${patch}"
45
+ echo "VERSION=${VERSION}" >> "$GITHUB_ENV"
46
+ echo "Bumped ${CURRENT} -> ${VERSION} (${{ github.event.inputs.bump }})"
47
+
48
+ - name: Update __about__.py on default branch
49
+ run: |
50
+ git config user.name "github-actions[bot]"
51
+ git config user.email "github-actions[bot]@users.noreply.github.com"
52
+ sed -i "s/^__version__ = .*$/__version__ = \"${VERSION}\"/" src/banks/__about__.py
53
+ git diff --quiet && exit 0
54
+ git add src/banks/__about__.py
55
+ git commit -m "chore: set __version__ to ${VERSION} [skip ci]"
56
+ git push origin "${{ github.event.repository.default_branch }}"
57
+
58
+ - name: Create and push tag
59
+ run: |
60
+ git tag "v${VERSION}"
61
+ git push origin "v${VERSION}"
62
+
63
+ - name: Install Hatch
64
+ run: pip install hatch
65
+
66
+ - name: Publish on PyPi
67
+ env:
68
+ HATCH_INDEX_USER: __token__
69
+ HATCH_INDEX_AUTH: ${{ secrets.PYPI_API_TOKEN }}
70
+ run: |
71
+ hatch build
72
+ hatch publish -y
73
+
74
+ - name: Create GitHub Release
75
+ uses: ncipollo/release-action@v1
76
+ with:
77
+ tag: v${{ env.VERSION }}
78
+ artifacts: "dist/*"
79
+ generateReleaseNotes: true
@@ -1,21 +1,33 @@
1
- # CLAUDE.md
1
+ # AGENTS.md
2
2
 
3
- This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
3
+ This file provides guidance to AI coding assistants when working with code in this repository.
4
4
 
5
5
  ## Project Overview
6
6
 
7
7
  Banks is a Python prompt programming language and templating system for LLM applications. It provides a Jinja2-based template engine with specialized extensions and filters for creating dynamic prompts, managing chat messages, handling multimodal content (images/audio/video/documents), and integrating with various LLM providers through LiteLLM.
8
8
 
9
+ ## Quick Reference
10
+
11
+ ```bash
12
+ # Most common commands
13
+ hatch run test # Run unit tests
14
+ hatch run lint:all # Run all linting checks
15
+ hatch run lint:fmt # Auto-format code
16
+ hatch run test tests/test_foo.py # Run specific test file
17
+ ```
18
+
9
19
  ## Development Commands
10
20
 
11
21
  ### Testing
12
22
  - Run tests: `hatch run test`
13
- - Run tests with coverage: `hatch run test-cov`
23
+ - Run tests with coverage: `hatch run test-cov`
14
24
  - Generate coverage report: `hatch run cov`
15
- - Run e2e tests specifically: `hatch run test tests/e2e/`
25
+ - Run specific test file: `hatch run test tests/test_foo.py`
26
+ - Run e2e tests: `hatch run test tests/e2e/` (requires API keys)
16
27
 
17
28
  ### Linting and Type Checking
18
29
  - Format code: `hatch run lint:fmt`
30
+ - Auto-fix lint issues: `hatch run lint:fix`
19
31
  - Check formatting: `hatch run lint:check`
20
32
  - Run type checking: `hatch run lint:typing`
21
33
  - Run pylint: `hatch run lint:lint`
@@ -27,17 +39,17 @@ Banks is a Python prompt programming language and templating system for LLM appl
27
39
 
28
40
  ### Environment Management
29
41
  - All commands use Hatch environments with automatic dependency management
30
- - Use `uv` as the installer for faster dependency resolution
31
- - Python 3.10+ supported across multiple versions (3.10-3.14)
42
+ - Uses `uv` as the installer for faster dependency resolution
43
+ - Python 3.9+ supported (tested on 3.10-3.14)
32
44
 
33
45
  ## Architecture Overview
34
46
 
35
47
  ### Core Components
36
48
 
37
49
  **Prompt Classes** (`src/banks/prompt.py`):
38
- - `BasePrompt`: Base class with common functionality for template rendering, metadata, versioning, and caching
50
+ - `BasePrompt`: Base class with template rendering, metadata, versioning, and caching
39
51
  - `Prompt`: Synchronous prompt rendering with `text()` and `chat_messages()` methods
40
- - `AsyncPrompt`: Asynchronous version for use within asyncio loops (requires `BANKS_ASYNC_ENABLED=true`)
52
+ - `AsyncPrompt`: Asynchronous version (requires `BANKS_ASYNC_ENABLED=true`)
41
53
  - `PromptRegistry`: Protocol interface for prompt storage backends
42
54
 
43
55
  **Type System** (`src/banks/types.py`):
@@ -51,13 +63,21 @@ Banks is a Python prompt programming language and templating system for LLM appl
51
63
  - Async support detection and configuration
52
64
  - Custom template loader integration
53
65
 
66
+ **Error Types** (`src/banks/errors.py`):
67
+ - `MissingDependencyError`: Optional dependencies not installed
68
+ - `AsyncError`: Asyncio support misconfiguration
69
+ - `CanaryWordError`: Canary word leaked (prompt injection detection)
70
+ - `PromptNotFoundError`: Prompt not found in registry
71
+ - `InvalidPromptError`: Invalid prompt format
72
+ - `LLMError`: LLM provider errors
73
+
54
74
  ### Extensions System
55
75
 
56
76
  **Chat Extension** (`src/banks/extensions/chat.py`):
57
77
  - `{% chat role="..." %}...{% endchat %}` blocks for structured message creation
58
78
  - Automatic conversion to `ChatMessage` objects during rendering
59
79
 
60
- **Completion Extension** (`src/banks/extensions/completion.py`):
80
+ **Completion Extension** (`src/banks/extensions/completion.py`):
61
81
  - `{% completion model="..." %}...{% endcompletion %}` for in-prompt LLM calls
62
82
  - Integrated with LiteLLM for multi-provider support
63
83
  - Function calling support within completion blocks
@@ -65,39 +85,49 @@ Banks is a Python prompt programming language and templating system for LLM appl
65
85
  ### Filters System
66
86
 
67
87
  **Core Filters** (`src/banks/filters/`):
68
- - `image`: Convert file paths/URLs to base64-encoded image content blocks
69
- - `audio`: Convert audio files to base64-encoded audio content blocks
88
+ - `image`: Convert file paths/URLs/bytes to base64-encoded image content blocks
89
+ - `audio`: Convert audio files to base64-encoded audio content blocks
70
90
  - `video`: Convert video files to base64-encoded video content blocks
71
91
  - `document`: Convert documents (PDF, TXT, HTML, CSS, XML, CSV, RTF, JS, JSON) to base64-encoded content blocks
72
92
  - `cache_control`: Add Anthropic cache control metadata to content blocks
73
93
  - `tool`: Convert Python callables to LLM function call schemas
74
94
  - `lemmatize`: Text lemmatization using simplemma
75
95
 
96
+ **Filter Pattern**: Filters wrap content in `<content_block>` tags and are only useful within `{% chat %}` blocks.
97
+
76
98
  ### Registry System
77
99
 
78
100
  **Storage Backends** (`src/banks/registries/`):
79
101
  - `DirectoryTemplateRegistry`: File system-based prompt storage
80
- - `FileTemplateRegistry`: Single file-based storage
102
+ - `FileTemplateRegistry`: Single file-based storage
81
103
  - `RedisTemplateRegistry`: Redis-backed storage for distributed scenarios
82
104
  - All registries implement the `PromptRegistry` protocol
83
105
 
106
+ ### Caching System
107
+
108
+ **Render Cache** (`src/banks/cache.py`):
109
+ - `RenderCache`: Protocol interface for caching rendered prompts
110
+ - `DefaultCache`: In-memory cache using pickle-serialized context as key
111
+ - Prevents re-rendering identical template + context combinations
112
+
84
113
  ### Configuration
85
114
 
86
115
  **Config System** (`src/banks/config.py`):
87
116
  - Environment variable-based configuration with `BANKS_` prefix
88
- - `BANKS_ASYNC_ENABLED`: Enable async template rendering
117
+ - `BANKS_ASYNC_ENABLED`: Enable async template rendering (must be set before import)
89
118
  - `BANKS_USER_DATA_PATH`: Custom user data directory
90
119
 
91
120
  ## Key Development Patterns
92
121
 
93
122
  ### Template Rendering Flow
94
123
  1. Templates parsed by Jinja2 environment with Banks extensions
95
- 2. Chat blocks converted to JSON during rendering
124
+ 2. Chat blocks converted to JSON during rendering
96
125
  3. `chat_messages()` parses JSON back to `ChatMessage` objects
97
126
  4. Caching layer prevents re-rendering identical contexts
98
127
 
99
128
  ### Multimodal Content Handling
100
129
  - Images/audio/video/documents converted to base64 during filter application
130
+ - Filters accept file paths, URLs, or raw bytes
101
131
  - Content blocks maintain type safety and metadata
102
132
  - Cache control integrated at content block level
103
133
 
@@ -106,23 +136,83 @@ Banks is a Python prompt programming language and templating system for LLM appl
106
136
  - Docstring parsing for parameter descriptions
107
137
  - Type annotations converted to JSON Schema
108
138
 
109
- ### Async Support Architecture
139
+ ### Async Support Architecture
110
140
  - Global environment state requires async decision at import time
111
141
  - `BANKS_ASYNC_ENABLED` must be set before importing banks modules
112
142
  - `AsyncPrompt` provides `await`-able rendering methods
113
143
 
114
- ## Testing Strategy
144
+ ## Testing
145
+
146
+ ### Test Markers
147
+ - `@pytest.mark.e2e`: End-to-end tests requiring external services
148
+ - `@pytest.mark.redis`: Tests requiring a running Redis instance
149
+
150
+ ### Required Environment Variables for E2E Tests
151
+ - `OPENAI_API_KEY`: For OpenAI-based tests
152
+ - `ANTHROPIC_API_KEY`: For Anthropic-based tests
153
+
154
+ ### Test Data
155
+ - Test fixtures in `tests/data/` (images, audio, video, PDFs)
156
+ - Template examples in `tests/templates/`
157
+
158
+ ### Running Specific Tests
159
+ ```bash
160
+ hatch run test tests/test_image.py # Single file
161
+ hatch run test tests/test_image.py::test_name # Single test
162
+ hatch run test -k "image" # Tests matching pattern
163
+ ```
164
+
165
+ ## Code Style
166
+
167
+ ### Formatting
168
+ - Line length: 120 characters
169
+ - Use ruff for formatting and linting
170
+ - Imports sorted with `banks` as first-party
171
+
172
+ ### Type Hints
173
+ - All public functions should have type annotations
174
+ - Use `from __future__ import annotations` for forward references
175
+ - MyPy strict mode enforced
176
+
177
+ ### Conventions
178
+ - SPDX license headers in all source files
179
+ - Docstrings for public APIs
180
+ - Relative imports banned (use absolute `from banks.x import y`)
181
+
182
+ ## Public API
115
183
 
116
- - Unit tests for individual components in `tests/`
117
- - E2e tests requiring API keys in `tests/e2e/` (marked with `@pytest.mark.e2e`)
118
- - Template examples in `tests/templates/` for integration testing
119
- - Coverage excludes async-specific code paths and deprecated modules
184
+ The main exports from `banks` package:
185
+ ```python
186
+ from banks import Prompt, AsyncPrompt, ChatMessage, config, env
187
+ ```
120
188
 
121
- ## Key Dependencies
189
+ ## Dependencies
122
190
 
191
+ **Core (required)**:
123
192
  - `jinja2`: Core templating engine
124
193
  - `pydantic`: Type validation and serialization
125
- - `litellm`: Multi-provider LLM integration (optional)
126
- - `redis`: Redis registry backend (optional)
127
194
  - `griffe`: Code introspection utilities
128
- - `platformdirs`: Cross-platform data directory handling
195
+ - `platformdirs`: Cross-platform data directory handling
196
+ - `filetype`: File type detection for multimodal content
197
+ - `deprecated`: Deprecation decorators
198
+
199
+ **Optional**:
200
+ - `litellm`: Multi-provider LLM integration (`banks[all]`)
201
+ - `redis`: Redis registry backend (`banks[all]`)
202
+ - `simplemma`: Lemmatization filter (dev dependency)
203
+
204
+ ## CI/CD
205
+
206
+ - **test.yml**: Runs tests on Python 3.10-3.14
207
+ - **docs.yml**: Builds and deploys documentation
208
+ - **release.yml**: Handles package releases
209
+
210
+ ## PR Guidelines
211
+
212
+ Follow conventional commit prefixes for PR titles:
213
+ - `fix:` - Bug fixes
214
+ - `feat:` - New features
215
+ - `chore:` - Maintenance
216
+ - `docs:` - Documentation
217
+ - `refactor:` - Code refactoring
218
+ - `test:` - Test additions/changes
banks-2.4.0/CLAUDE.md ADDED
@@ -0,0 +1,3 @@
1
+ # CLAUDE.md
2
+
3
+ See [AGENTS.md](./AGENTS.md) for development guidance and project context.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: banks
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: A prompt programming language
5
5
  Project-URL: Documentation, https://github.com/masci/banks#readme
6
6
  Project-URL: Issues, https://github.com/masci/banks/issues
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: Implementation :: PyPy
21
21
  Requires-Python: >=3.9
22
22
  Requires-Dist: deprecated
23
23
  Requires-Dist: eval-type-backport; python_version < '3.10'
24
+ Requires-Dist: filetype>=1.2.0
24
25
  Requires-Dist: griffe
25
26
  Requires-Dist: jinja2
26
27
  Requires-Dist: platformdirs
@@ -30,6 +30,7 @@ dependencies = [
30
30
  "deprecated",
31
31
  "eval-type-backport;python_version<'3.10'",
32
32
  "platformdirs",
33
+ "filetype>=1.2.0",
33
34
  ]
34
35
 
35
36
  [project.optional-dependencies]
@@ -78,6 +79,7 @@ lint = "pylint {args:src/banks}"
78
79
  typing = "mypy --install-types --non-interactive {args:src/banks}"
79
80
  all = ["check", "typing", "lint"]
80
81
  fmt = "ruff format {args}"
82
+ fix = "ruff check --fix {args}"
81
83
 
82
84
  [tool.hatch.build.targets.wheel]
83
85
  only-include = ["src/banks", "src/templates"]
@@ -183,6 +185,7 @@ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
183
185
  module = ["litellm.*", "simplemma.*", "deprecated.*"]
184
186
  ignore_missing_imports = true
185
187
 
188
+
186
189
  [tool.pylint]
187
190
  disable = [
188
191
  "line-too-long",
@@ -1,4 +1,4 @@
1
1
  # SPDX-FileCopyrightText: 2023-present Massimiliano Pippi <mpippi@gmail.com>
2
2
  #
3
3
  # SPDX-License-Identifier: MIT
4
- __version__ = "2.3.0"
4
+ __version__ = "2.4.0"
@@ -6,7 +6,9 @@ from pathlib import Path
6
6
  from typing import cast
7
7
  from urllib.parse import urlparse
8
8
 
9
- from banks.types import AudioFormat, ContentBlock, InputAudio
9
+ import filetype # type: ignore[import-untyped]
10
+
11
+ from banks.types import AudioFormat, ContentBlock, InputAudio, resolve_binary
10
12
 
11
13
  BASE64_AUDIO_REGEX = re.compile(r"audio\/.*;base64,.*")
12
14
 
@@ -38,7 +40,18 @@ def _get_audio_format_from_url(url: str) -> AudioFormat:
38
40
  return "mp3"
39
41
 
40
42
 
41
- def audio(value: str) -> str:
43
+ def _get_audio_format_from_bytes(data: bytes) -> AudioFormat:
44
+ """Extract audio format from bytes data using filetype library."""
45
+ kind = filetype.guess(data)
46
+ if kind is not None:
47
+ fmt = kind.extension
48
+ if fmt in ("mp3", "wav", "m4a", "webm", "ogg", "flac"):
49
+ return cast(AudioFormat, fmt)
50
+ # Default to mp3 if format cannot be determined
51
+ return "mp3"
52
+
53
+
54
+ def audio(value: str | bytes) -> str:
42
55
  """Wrap the filtered value into a ContentBlock of type audio.
43
56
 
44
57
  The resulting ChatMessage will have the field `content` populated with a list of ContentBlock objects.
@@ -51,7 +64,10 @@ def audio(value: str) -> str:
51
64
  {{ "https://example.com/audio.mp3" | audio }}
52
65
  ```
53
66
  """
54
- if _is_url(value):
67
+ if isinstance(value, bytes):
68
+ audio_format = _get_audio_format_from_bytes(resolve_binary(value, as_base64=False))
69
+ input_audio = InputAudio.from_bytes(value, audio_format=audio_format)
70
+ elif _is_url(value):
55
71
  audio_format = _get_audio_format_from_url(value)
56
72
  input_audio = InputAudio.from_url(value, audio_format)
57
73
  else:
@@ -1,12 +1,15 @@
1
1
  # SPDX-FileCopyrightText: 2023-present Massimiliano Pippi <mpippi@gmail.com>
2
2
  #
3
3
  # SPDX-License-Identifier: MIT
4
+ import mimetypes
4
5
  import re
5
6
  from pathlib import Path
6
7
  from typing import cast
7
8
  from urllib.parse import urlparse
8
9
 
9
- from banks.types import ContentBlock, DocumentFormat, InputDocument
10
+ import filetype # type: ignore[import-untyped]
11
+
12
+ from banks.types import ContentBlock, DocumentFormat, InputDocument, resolve_binary
10
13
 
11
14
  BASE64_DOCUMENT_REGEX = re.compile(r"(text|application)\/.*;base64,.*")
12
15
 
@@ -36,7 +39,7 @@ def _get_document_format_from_url(url: str) -> DocumentFormat:
36
39
  # text/css
37
40
  # text/plain
38
41
  # text/xml
39
- # text/scv
42
+ # text/csv
40
43
  # text/rtf
41
44
  # text/javascript
42
45
  # application/json
@@ -68,13 +71,46 @@ def _get_document_format_from_url(url: str) -> DocumentFormat:
68
71
  "javascript",
69
72
  "json",
70
73
  ):
74
+ # Because Claude only supports pdf and text, and Gemini only supports a small subset of text formats,
75
+ # we can default to 'txt' for any text-based format that is not pdf. This allows the data to be sent to the llm
76
+ # in an acceptable format, but the LLM should still be able to understand the content: e.g., html, markdown,
77
+ # xml, etc.
71
78
  if path.endswith(f".{fmt}"):
79
+ if fmt == "pdf":
80
+ return cast(DocumentFormat, "pdf")
81
+ return "txt"
82
+ mime = mimetypes.guess_type(path)[0]
83
+ if mime is not None and mime.startswith("text/"):
84
+ return "txt"
85
+ # For URLs with no recognizable type, assume pdf: it is the most likely format when nothing else is indicated
86
+ if mime is None:
87
+ return "pdf"
88
+ # Document type indicated to be other than pdf or text type
89
+ raise ValueError("Unsupported document format: " + path)
90
+
91
+
92
+ def _get_document_format_from_bytes(data: bytes) -> DocumentFormat:
93
+ """Extract document format from bytes data using filetype library."""
94
+ # First detect the formats identifiable by their file header: pdf (the only non-text-based format) and RTF
95
+ kind = filetype.guess(data)
96
+ if kind is not None:
97
+ fmt = kind.extension
98
+ if fmt == "pdf":
72
99
  return cast(DocumentFormat, fmt)
73
- # Default to pdf if format cannot be determined
74
- return "pdf"
100
+
101
+ # filetype is good at detecting binary formats, but not text-based ones.
102
+ # So a failed detection (kind is None) is a good indicator that the data is text-based.
103
+ # Because Claude only supports pdf and text, and Gemini only supports a small subset of text formats,
104
+ # we can default to 'txt' for any text-based format that is not pdf. This allows the data to be sent to the llm in
105
+ # an acceptable format, but the LLM should still be able to understand the content: e.g., html, markdown, xml, etc.
106
+ # If detecting specific text types should become desirable, consider using something like Google's Magika
107
+ if kind is None or kind.extension == "rtf":
108
+ return "txt"
109
+ # There are many common document types (like word, excel, powerpoint, etc.) that are not supported.
110
+ raise ValueError("Unsupported document format: " + kind.extension)
75
111
 
76
112
 
77
- def document(value: str) -> str:
113
+ def document(value: str | bytes) -> str:
78
114
  """Wrap the filtered value into a ContentBlock of type document.
79
115
 
80
116
  The resulting ChatMessage will have the field `content` populated with a list of ContentBlock objects.
@@ -87,7 +123,10 @@ def document(value: str) -> str:
87
123
  {{ "https://example.com/document.pdf" | document }}
88
124
  ```
89
125
  """
90
- if _is_url(value):
126
+ if isinstance(value, bytes):
127
+ document_format = _get_document_format_from_bytes(resolve_binary(value, as_base64=False))
128
+ input_document = InputDocument.from_bytes(value, document_format=document_format)
129
+ elif _is_url(value):
91
130
  document_format = _get_document_format_from_url(value)
92
131
  input_document = InputDocument.from_url(value, document_format)
93
132
  else:
@@ -22,7 +22,7 @@ def _is_url(string: str) -> bool:
22
22
  return True
23
23
 
24
24
 
25
- def image(value: str) -> str:
25
+ def image(value: str | bytes) -> str:
26
26
  """Wrap the filtered value into a ContentBlock of type image.
27
27
 
28
28
  The resulting ChatMessage will have the field `content` populated with a list of ContentBlock objects.
@@ -38,7 +38,9 @@ def image(value: str) -> str:
38
38
  this filter marks the content to cache by surrounding it with `<content_block>` and
39
39
  `</content_block>`, so it's only useful when used within a `{% chat %}` block.
40
40
  """
41
- if _is_url(value):
41
+ if isinstance(value, bytes):
42
+ image_url = ImageUrl.from_bytes(bytes_str=value)
43
+ elif _is_url(value):
42
44
  image_url = ImageUrl(url=value)
43
45
  else:
44
46
  image_url = ImageUrl.from_path(Path(value))
@@ -6,11 +6,39 @@ from pathlib import Path
6
6
  from typing import cast
7
7
  from urllib.parse import urlparse
8
8
 
9
- from banks.types import ContentBlock, InputVideo, VideoFormat
9
+ import filetype # type: ignore[import-untyped]
10
+ from filetype.types.video import IsoBmff # type: ignore[import-untyped]
11
+
12
+ from banks.types import ContentBlock, InputVideo, VideoFormat, resolve_binary
10
13
 
11
14
  BASE64_VIDEO_REGEX = re.compile(r"video\/.*;base64,.*")
12
15
 
13
16
 
17
+ class M3gp(IsoBmff):
18
+ """
19
+ Implements the 3gp video type matcher.
20
+
21
+ The type matcher in the filetype lib does not work correctly for 3gp files,
22
+ so implement our own here.
23
+ """
24
+
25
+ MIME = "video/3gpp"
26
+ EXTENSION = "3gp"
27
+
28
+ def __init__(self):
29
+ super().__init__(mime=M3gp.MIME, extension=M3gp.EXTENSION)
30
+
31
+ def match(self, buf):
32
+ if not self._is_isobmff(buf):
33
+ return False
34
+
35
+ major_brand, _, compatible_brands = self._get_ftyp(buf)
36
+ for brand in compatible_brands:
37
+ if brand in ["3gp4", "3gp5", "3gpp"]:
38
+ return True
39
+ return major_brand in ["3gp4", "3gp5", "3gpp"]
40
+
41
+
14
42
  def _is_url(string: str) -> bool:
15
43
  """Check if a string is a URL."""
16
44
  result = urlparse(string)
@@ -40,7 +68,22 @@ def _get_video_format_from_url(url: str) -> VideoFormat:
40
68
  return "mp4"
41
69
 
42
70
 
43
- def video(value: str) -> str:
71
+ def _get_video_format_from_bytes(data: bytes) -> VideoFormat:
72
+ """Extract video format from bytes data using filetype library."""
73
+ m3gp = M3gp()
74
+ if m3gp not in filetype.types:
75
+ filetype.add_type(m3gp)
76
+
77
+ kind = filetype.guess(data)
78
+ if kind is not None:
79
+ fmt = kind.extension
80
+ if fmt in ("mp4", "mpg", "mov", "avi", "flv", "webm", "wmv", "3gp"):
81
+ return cast(VideoFormat, fmt)
82
+ # Default to mp4 if format cannot be determined
83
+ return "mp4"
84
+
85
+
86
+ def video(value: str | bytes) -> str:
44
87
  """Wrap the filtered value into a ContentBlock of type video.
45
88
 
46
89
  The resulting ChatMessage will have the field `content` populated with a list of ContentBlock objects.
@@ -53,7 +96,10 @@ def video(value: str) -> str:
53
96
  {{ "https://example.com/video.mp4" | video }}
54
97
  ```
55
98
  """
56
- if _is_url(value):
99
+ if isinstance(value, bytes):
100
+ video_format = _get_video_format_from_bytes(resolve_binary(value, as_base64=False))
101
+ input_video = InputVideo.from_bytes(value, video_format=video_format)
102
+ elif _is_url(value):
57
103
  video_format = _get_video_format_from_url(value)
58
104
  input_video = InputVideo.from_url(value, video_format)
59
105
  else: