speedy-utils 1.1.32__tar.gz → 1.1.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speedy_utils-1.1.34/.github/skills/caching-utilities/SKILL.md +133 -0
- speedy_utils-1.1.34/.github/skills/caching-utilities/examples/caching_example.py +37 -0
- speedy_utils-1.1.34/.github/skills/io-utilities/SKILL.md +111 -0
- speedy_utils-1.1.34/.github/skills/io-utilities/examples/io_example.py +35 -0
- speedy_utils-1.1.34/.github/skills/llm-integration/SKILL.md +115 -0
- speedy_utils-1.1.34/.github/skills/llm-integration/examples/llm_example.py +27 -0
- speedy_utils-1.1.34/.github/skills/multi-threading-processing/SKILL.md +448 -0
- speedy_utils-1.1.34/.github/skills/ray-distributed-computing/SKILL.md +704 -0
- speedy_utils-1.1.34/.github/skills/skill-creation/SKILL.md +516 -0
- speedy_utils-1.1.34/.github/skills/vision-utilities/SKILL.md +105 -0
- speedy_utils-1.1.34/.github/skills/vision-utilities/examples/vision_example.py +44 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/PKG-INFO +1 -1
- speedy_utils-1.1.34/docs/IMPLEMENTATION.md +149 -0
- speedy_utils-1.1.34/docs/QUICKSTART.md +109 -0
- speedy_utils-1.1.34/docs/zero_copy_sharing.md +235 -0
- speedy_utils-1.1.34/examples/pytorch_large_model.py +285 -0
- speedy_utils-1.1.34/examples/shared_kwargs_example.py +200 -0
- speedy_utils-1.1.34/examples/test_share_ray.py +24 -0
- speedy_utils-1.1.34/experiments/exp1/dockerfile +20 -0
- speedy_utils-1.1.34/experiments/exp1/run_in_docker.sh +14 -0
- speedy_utils-1.1.34/experiments/exp1/test.png +0 -0
- speedy_utils-1.1.34/experiments/test_read_image.py +37 -0
- speedy_utils-1.1.34/notebooks/README.ipynb +389 -0
- speedy_utils-1.1.34/notebooks/ray_tutorial.ipynb +120 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/pyproject.toml +1 -1
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/scripts/imports.sh +0 -0
- speedy_utils-1.1.34/src/datasets/convert_to_arrow.py +40 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/llm.py +4 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/utils.py +5 -1
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/__imports.py +21 -6
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/multi_worker/process.py +121 -25
- speedy_utils-1.1.34/src/vision_utils/__init__.py +11 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/vision_utils/io_utils.py +15 -14
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/vision_utils/plot.py +2 -2
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/import_time_report.py +2 -1
- speedy_utils-1.1.34/tests/integration_test.py +51 -0
- speedy_utils-1.1.34/tests/test_pytorch_sharing.py +269 -0
- speedy_utils-1.1.34/tests/test_shared_kwargs.py +197 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/uv.lock +29 -15
- speedy_utils-1.1.32/src/vision_utils/__init__.py +0 -4
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/.github/copilot-instructions.md +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/.github/workflows/publish.yml +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/.gitignore +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/.pre-commit-config.yaml +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/README.md +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/bumpversion.sh +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/examples/temperature_range_example.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/examples/vision_utils_example.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/notebooks/llm_utils/llm_as_a_judge.ipynb +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/notebooks/test_multi_thread.ipynb +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/ruff.toml +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/scripts/debug_import_time.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/scripts/deploy.sh +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/scripts/test_import_time_vision.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/setup.cfg +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/chat_format/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/chat_format/display.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/chat_format/transform.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/chat_format/utils.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/group_messages.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/async_lm/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/async_lm/_utils.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/async_lm/async_lm.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/async_lm/async_lm_base.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/base_prompt_builder.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/llm_signature.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/lm_base.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/mixins.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/openai_memoize.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/lm/signature.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/scripts/README.md +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/scripts/vllm_serve.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/vector_cache/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/vector_cache/cli.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/vector_cache/core.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/vector_cache/types.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/llm_utils/vector_cache/utils.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/clock.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/function_decorator.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/logger.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/notebook_utils.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/patcher.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/report_manager.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/utils_cache.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/utils_io.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/utils_misc.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/common/utils_print.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/multi_worker/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/multi_worker/thread.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/scripts/__init__.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/scripts/mpython.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/src/vision_utils/README.md +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/import_all.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/llm_utils/test_llm_mixins.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/sample_objects.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_logger.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_logger_format.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_memoize_typing.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_mpython.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_multithread_error_trace.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_process.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_process_update.py +0 -0
- {speedy_utils-1.1.32 → speedy_utils-1.1.34}/tests/test_thread.py +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: 'caching-utilities'
|
|
3
|
+
description: 'Guide for using caching utilities in speedy_utils, including memory, disk, and hybrid caching strategies for sync and async functions.'
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Caching Utilities Guide
|
|
7
|
+
|
|
8
|
+
This skill provides comprehensive guidance for using the caching utilities in `speedy_utils`.
|
|
9
|
+
|
|
10
|
+
## When to Use This Skill
|
|
11
|
+
|
|
12
|
+
Use this skill when you need to:
|
|
13
|
+
- Optimize performance by caching expensive function calls.
|
|
14
|
+
- Persist results across program runs using disk caching.
|
|
15
|
+
- Use memory caching for fast access within a single run.
|
|
16
|
+
- Handle caching for both synchronous and asynchronous functions.
|
|
17
|
+
- Use `imemoize` for persistent caching in interactive environments like Jupyter notebooks.
|
|
18
|
+
|
|
19
|
+
## Prerequisites
|
|
20
|
+
|
|
21
|
+
- `speedy_utils` installed in your environment.
|
|
22
|
+
|
|
23
|
+
## Core Capabilities
|
|
24
|
+
|
|
25
|
+
### Universal Memoization (`@memoize`)
|
|
26
|
+
- Supports `memory`, `disk`, and `both` (hybrid) caching backends.
|
|
27
|
+
- Works with both `sync` and `async` functions.
|
|
28
|
+
- Configurable LRU cache size for memory caching.
|
|
29
|
+
- Custom key generation strategies.
|
|
30
|
+
|
|
31
|
+
### Interactive Memoization (`@imemoize`)
|
|
32
|
+
- Designed for Jupyter notebooks and interactive sessions.
|
|
33
|
+
- Persists cache across module reloads (`%load`).
|
|
34
|
+
- Uses global memory cache.
|
|
35
|
+
|
|
36
|
+
### Object Identification (`identify`)
|
|
37
|
+
- Generates stable, content-based identifiers for arbitrary Python objects.
|
|
38
|
+
- Handles complex types like DataFrames, Pydantic models, and nested structures.
|
|
39
|
+
|
|
40
|
+
## Usage Examples
|
|
41
|
+
|
|
42
|
+
### Example 1: Basic Hybrid Caching
|
|
43
|
+
Cache results in both memory and disk.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from speedy_utils import memoize
|
|
47
|
+
import time
|
|
48
|
+
|
|
49
|
+
@memoize(cache_type='both', size=128)
|
|
50
|
+
def expensive_func(x: int):
|
|
51
|
+
time.sleep(1)
|
|
52
|
+
return x * x
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Example 2: Async Disk Caching
|
|
56
|
+
Cache results of an async function to disk.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from speedy_utils import memoize
|
|
60
|
+
import asyncio
|
|
61
|
+
|
|
62
|
+
@memoize(cache_type='disk', cache_dir='./my_cache')
|
|
63
|
+
async def fetch_data(url: str):
|
|
64
|
+
# simulate network call
|
|
65
|
+
await asyncio.sleep(1)
|
|
66
|
+
return {"data": "content"}
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Example 3: Custom Key Function
|
|
70
|
+
Use a custom key function for complex arguments.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from speedy_utils import memoize
|
|
74
|
+
|
|
75
|
+
def get_user_id(user):
|
|
76
|
+
return user.id
|
|
77
|
+
|
|
78
|
+
@memoize(key=get_user_id)
|
|
79
|
+
def process_user(user):
|
|
80
|
+
# ...
|
|
81
|
+
pass
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Example 4: Interactive Caching (Notebooks)
|
|
85
|
+
Use `@imemoize` to keep the cache even if you reload the cell or module.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from speedy_utils import imemoize
|
|
89
|
+
|
|
90
|
+
@imemoize
|
|
91
|
+
def notebook_func(data):
|
|
92
|
+
# ...
|
|
93
|
+
return result
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Guidelines
|
|
97
|
+
|
|
98
|
+
1. **Choose the Right Backend**:
|
|
99
|
+
- Use `memory` for small, fast results needed frequently in one session.
|
|
100
|
+
- Use `disk` for large results or to persist across runs.
|
|
101
|
+
- Use `both` (default) for the best of both worlds.
|
|
102
|
+
|
|
103
|
+
2. **Key Stability**:
|
|
104
|
+
- Ensure arguments are stable (e.g., avoid using objects with changing internal state as keys unless you provide a custom `key` function).
|
|
105
|
+
- `identify` handles most common types, but be careful with custom classes without `__repr__` or stable serialization.
|
|
106
|
+
|
|
107
|
+
3. **Cache Directory**:
|
|
108
|
+
- Default disk cache is `~/.cache/speedy_cache`.
|
|
109
|
+
- Override `cache_dir` for project-specific caching.
|
|
110
|
+
|
|
111
|
+
4. **Async Support**:
|
|
112
|
+
- The decorators automatically detect `async` functions and handle `await` correctly.
|
|
113
|
+
- Always `await` a memoized `async` function; do not call it from synchronous code as if it returned the value directly.
|
|
114
|
+
|
|
115
|
+
## Common Patterns
|
|
116
|
+
|
|
117
|
+
### Pattern: Ignoring `self`
|
|
118
|
+
By default, `ignore_self=True` is set. This means that methods on different instances of the same class will share cached results if the other arguments are the same. Set `ignore_self=False` if the instance state matters.
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
class Processor:
|
|
122
|
+
def __init__(self, multiplier):
|
|
123
|
+
self.multiplier = multiplier
|
|
124
|
+
|
|
125
|
+
@memoize(ignore_self=False)
|
|
126
|
+
def compute(self, x):
|
|
127
|
+
return x * self.multiplier
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Limitations
|
|
131
|
+
|
|
132
|
+
- **Pickle Compatibility**: Disk caching relies on `pickle` (or JSON). Ensure return values are serializable.
|
|
133
|
+
- **Cache Invalidation**: There is no automatic TTL (Time To Live) or expiration. You must manually clear cache files if data becomes stale.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
|
|
4
|
+
from speedy_utils import imemoize, memoize
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Sync Hybrid Cache
|
|
8
|
+
@memoize(cache_type='both')
|
|
9
|
+
def slow_square(x: int) -> int:
|
|
10
|
+
print(f"Computing square of {x}...")
|
|
11
|
+
time.sleep(1)
|
|
12
|
+
return x * x
|
|
13
|
+
|
|
14
|
+
# Async Disk Cache
|
|
15
|
+
@memoize(cache_type='disk', cache_dir='./temp_cache')
|
|
16
|
+
async def async_slow_cube(x: int) -> int:
|
|
17
|
+
print(f"Computing cube of {x}...")
|
|
18
|
+
await asyncio.sleep(1)
|
|
19
|
+
return x * x * x
|
|
20
|
+
|
|
21
|
+
# Interactive Cache
|
|
22
|
+
@imemoize
|
|
23
|
+
def interactive_op(x: int) -> int:
|
|
24
|
+
print(f"Interactive op on {x}...")
|
|
25
|
+
return x + 1
|
|
26
|
+
|
|
27
|
+
async def main():
|
|
28
|
+
print("--- Sync Cache ---")
|
|
29
|
+
print(slow_square(2))
|
|
30
|
+
print(slow_square(2)) # Should be instant
|
|
31
|
+
|
|
32
|
+
print("\n--- Async Cache ---")
|
|
33
|
+
print(await async_slow_cube(3))
|
|
34
|
+
print(await async_slow_cube(3)) # Should be instant
|
|
35
|
+
|
|
36
|
+
if __name__ == "__main__":
|
|
37
|
+
asyncio.run(main())
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: 'io-utilities'
|
|
3
|
+
description: 'Guide for using IO utilities in speedy_utils, including fast JSONL reading, multi-format loading, and file serialization.'
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# IO Utilities Guide
|
|
7
|
+
|
|
8
|
+
This skill provides comprehensive guidance for using the IO utilities in `speedy_utils`.
|
|
9
|
+
|
|
10
|
+
## When to Use This Skill
|
|
11
|
+
|
|
12
|
+
Use this skill when you need to:
|
|
13
|
+
- Read and write data in various formats (JSON, JSONL, Pickle, CSV, TXT).
|
|
14
|
+
- Efficiently process large JSONL files with streaming and multi-threading.
|
|
15
|
+
- Automatically handle file compression (gzip, bz2, xz, zstd).
|
|
16
|
+
- Load data based on file extension automatically.
|
|
17
|
+
- Serialize Pydantic models and other objects easily.
|
|
18
|
+
|
|
19
|
+
## Prerequisites
|
|
20
|
+
|
|
21
|
+
- `speedy_utils` installed.
|
|
22
|
+
- Optional dependencies for specific features:
|
|
23
|
+
- `orjson`: For faster JSON parsing.
|
|
24
|
+
- `zstandard`: For `.zst` file support.
|
|
25
|
+
- `pandas`: For CSV/TSV loading.
|
|
26
|
+
- `pyarrow`: For faster CSV reading with pandas.
|
|
27
|
+
|
|
28
|
+
## Core Capabilities
|
|
29
|
+
|
|
30
|
+
### Fast JSONL Processing (`fast_load_jsonl`)
|
|
31
|
+
- Streams data line-by-line for memory efficiency.
|
|
32
|
+
- Supports automatic decompression.
|
|
33
|
+
- Uses `orjson` if available for speed.
|
|
34
|
+
- Supports multi-threaded processing for large files.
|
|
35
|
+
- Shows a progress bar with `tqdm`.
|
|
36
|
+
|
|
37
|
+
### Universal Loading (`load_by_ext`)
|
|
38
|
+
- Detects file type by extension.
|
|
39
|
+
- Supports glob patterns (e.g., `data/*.json`) and lists of files.
|
|
40
|
+
- Uses parallel processing for multiple files.
|
|
41
|
+
- Supports memoization via `do_memoize=True`.
|
|
42
|
+
|
|
43
|
+
### Serialization (`dump_json_or_pickle`, `load_json_or_pickle`)
|
|
44
|
+
- Unified interface for JSON and Pickle.
|
|
45
|
+
- Handles Pydantic models automatically.
|
|
46
|
+
- Creates parent directories if they don't exist.
|
|
47
|
+
|
|
48
|
+
## Usage Examples
|
|
49
|
+
|
|
50
|
+
### Example 1: Streaming Large JSONL
|
|
51
|
+
Read a large compressed JSONL file line by line.
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from speedy_utils import fast_load_jsonl
|
|
55
|
+
|
|
56
|
+
# Iterates lazily, low memory usage
|
|
57
|
+
for item in fast_load_jsonl('large_data.jsonl.gz', progress=True):
|
|
58
|
+
process(item)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Example 2: Loading Any File
|
|
62
|
+
Load a file without worrying about the format.
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from speedy_utils import load_by_ext
|
|
66
|
+
|
|
67
|
+
data = load_by_ext('config.json')
|
|
68
|
+
df = load_by_ext('data.csv')
|
|
69
|
+
items = load_by_ext('dataset.pkl')
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Example 3: Parallel Loading
|
|
73
|
+
Load multiple files in parallel.
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from speedy_utils import load_by_ext
|
|
77
|
+
|
|
78
|
+
# Returns a list of results, one for each file
|
|
79
|
+
all_data = load_by_ext('logs/*.jsonl')
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Example 4: Dumping Data
|
|
83
|
+
Save data to disk, creating directories as needed.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from speedy_utils import dump_json_or_pickle
|
|
87
|
+
|
|
88
|
+
data = {"key": "value"}
|
|
89
|
+
dump_json_or_pickle(data, 'output/processed/result.json')
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Guidelines
|
|
93
|
+
|
|
94
|
+
1. **Prefer JSONL for Large Datasets**:
|
|
95
|
+
- Use `fast_load_jsonl` for datasets that don't fit in memory.
|
|
96
|
+
- It handles compression transparently, so keep files compressed (`.jsonl.gz` or `.jsonl.zst`) to save space.
|
|
97
|
+
|
|
98
|
+
2. **Use `load_by_ext` for Scripts**:
|
|
99
|
+
- When writing scripts that might accept different input formats, use `load_by_ext` to be flexible.
|
|
100
|
+
|
|
101
|
+
3. **Error Handling**:
|
|
102
|
+
- `fast_load_jsonl` has an `on_error` parameter (`raise`, `warn`, `skip`) to handle malformed lines gracefully.
|
|
103
|
+
|
|
104
|
+
4. **Performance**:
|
|
105
|
+
- Install `orjson` for significantly faster JSON operations.
|
|
106
|
+
- `load_by_ext` uses `pyarrow` engine for CSVs if available, which is much faster.
|
|
107
|
+
|
|
108
|
+
## Limitations
|
|
109
|
+
|
|
110
|
+
- **Memory Usage**: `load_by_ext` loads the entire file into memory. Use `fast_load_jsonl` for streaming.
|
|
111
|
+
- **Glob Expansion**: `load_by_ext` with glob patterns loads *all* matching files into memory at once (in a list). Be careful with massive datasets.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from speedy_utils import dump_json_or_pickle, fast_load_jsonl, load_by_ext
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main():
|
|
7
|
+
# 1. Create some dummy data
|
|
8
|
+
data = [{"id": i, "value": f"item_{i}"} for i in range(100)]
|
|
9
|
+
|
|
10
|
+
# 2. Dump to JSONL
|
|
11
|
+
print("Dumping to data.jsonl...")
|
|
12
|
+
dump_json_or_pickle(data, 'data.jsonl')
|
|
13
|
+
|
|
14
|
+
# 3. Dump to Pickle
|
|
15
|
+
print("Dumping to data.pkl...")
|
|
16
|
+
dump_json_or_pickle(data, 'data.pkl')
|
|
17
|
+
|
|
18
|
+
# 4. Load using load_by_ext
|
|
19
|
+
print("Loading data.pkl...")
|
|
20
|
+
loaded_pkl = load_by_ext('data.pkl')
|
|
21
|
+
print(f"Loaded {len(loaded_pkl)} items from pickle.")
|
|
22
|
+
|
|
23
|
+
# 5. Stream using fast_load_jsonl
|
|
24
|
+
print("Streaming data.jsonl...")
|
|
25
|
+
count = 0
|
|
26
|
+
for item in fast_load_jsonl('data.jsonl', progress=True):
|
|
27
|
+
count += 1
|
|
28
|
+
print(f"Streamed {count} items from jsonl.")
|
|
29
|
+
|
|
30
|
+
# Cleanup
|
|
31
|
+
os.remove('data.jsonl')
|
|
32
|
+
os.remove('data.pkl')
|
|
33
|
+
|
|
34
|
+
if __name__ == "__main__":
|
|
35
|
+
main()
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: 'llm-integration'
|
|
3
|
+
description: 'Guide for using LLM utilities in speedy_utils, including memoized OpenAI clients and chat format transformations.'
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# LLM Integration Guide
|
|
7
|
+
|
|
8
|
+
This skill provides comprehensive guidance for using the LLM utilities in `speedy_utils`.
|
|
9
|
+
|
|
10
|
+
## When to Use This Skill
|
|
11
|
+
|
|
12
|
+
Use this skill when you need to:
|
|
13
|
+
- Make OpenAI API calls with automatic caching (memoization) to save costs and time.
|
|
14
|
+
- Transform chat messages between different formats (ChatML, ShareGPT, Text).
|
|
15
|
+
- Prepare prompts for local LLM inference.
|
|
16
|
+
|
|
17
|
+
## Prerequisites
|
|
18
|
+
|
|
19
|
+
- `speedy_utils` installed.
|
|
20
|
+
- `openai` package installed for API clients.
|
|
21
|
+
|
|
22
|
+
## Core Capabilities
|
|
23
|
+
|
|
24
|
+
### Memoized OpenAI Clients (`MOpenAI`, `MAsyncOpenAI`)
|
|
25
|
+
- Drop-in replacements for `OpenAI` and `AsyncOpenAI`.
|
|
26
|
+
- Automatically caches `post` (chat completion) requests.
|
|
27
|
+
- Uses `speedy_utils` caching backend (disk/memory).
|
|
28
|
+
- Configurable per-instance caching.
|
|
29
|
+
|
|
30
|
+
### Chat Format Transformation (`transform_messages`)
|
|
31
|
+
- Converts between:
|
|
32
|
+
- `chatml`: List of `{"role": "...", "content": "..."}` dicts.
|
|
33
|
+
- `sharegpt`: Dict with `{"conversations": [{"from": "...", "value": "..."}]}`.
|
|
34
|
+
- `text`: String with `<|im_start|>` tokens.
|
|
35
|
+
- `simulated_chat`: Human/AI transcript format.
|
|
36
|
+
- Supports applying tokenizer templates.
|
|
37
|
+
|
|
38
|
+
## Usage Examples
|
|
39
|
+
|
|
40
|
+
### Example 1: Memoized OpenAI Call
|
|
41
|
+
Make repeated calls without hitting the API twice.
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from llm_utils.lm.openai_memoize import MOpenAI
|
|
45
|
+
|
|
46
|
+
# Initialize just like OpenAI client
|
|
47
|
+
client = MOpenAI(api_key="sk-...")
|
|
48
|
+
|
|
49
|
+
# First call hits the API
|
|
50
|
+
response1 = client.chat.completions.create(
|
|
51
|
+
model="gpt-3.5-turbo",
|
|
52
|
+
messages=[{"role": "user", "content": "Hello"}]
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Second call returns cached result instantly
|
|
56
|
+
response2 = client.chat.completions.create(
|
|
57
|
+
model="gpt-3.5-turbo",
|
|
58
|
+
messages=[{"role": "user", "content": "Hello"}]
|
|
59
|
+
)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Example 2: Async Memoized Call
|
|
63
|
+
Same as above but for async workflows.
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from llm_utils.lm.openai_memoize import MAsyncOpenAI
|
|
67
|
+
import asyncio
|
|
68
|
+
|
|
69
|
+
async def main():
|
|
70
|
+
client = MAsyncOpenAI(api_key="sk-...")
|
|
71
|
+
response = await client.chat.completions.create(
|
|
72
|
+
model="gpt-4",
|
|
73
|
+
messages=[{"role": "user", "content": "Hi"}]
|
|
74
|
+
)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Example 3: Transforming Chat Formats
|
|
78
|
+
Convert ShareGPT format to ChatML.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from llm_utils.chat_format.transform import transform_messages
|
|
82
|
+
|
|
83
|
+
sharegpt_data = {
|
|
84
|
+
"conversations": [
|
|
85
|
+
{"from": "human", "value": "Hi"},
|
|
86
|
+
{"from": "gpt", "value": "Hello there"}
|
|
87
|
+
]
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
# Convert to ChatML list
|
|
91
|
+
chatml_data = transform_messages(sharegpt_data, frm="sharegpt", to="chatml")
|
|
92
|
+
# Result: [{'role': 'user', 'content': 'Hi'}, {'role': 'assistant', 'content': 'Hello there'}]
|
|
93
|
+
|
|
94
|
+
# Convert to Text string
|
|
95
|
+
text_data = transform_messages(chatml_data, frm="chatml", to="text")
|
|
96
|
+
# Result: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello there<|im_end|>\n<|im_start|>assistant\n"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Guidelines
|
|
100
|
+
|
|
101
|
+
1. **Caching Behavior**:
|
|
102
|
+
- The cache key is generated from the arguments passed to `create`.
|
|
103
|
+
- If you change any parameter (e.g., `temperature`, `model`), it counts as a new request.
|
|
104
|
+
- The cache persists across runs whenever disk caching is enabled, which is the default behavior of `memoize`.
|
|
105
|
+
|
|
106
|
+
2. **Format Detection**:
|
|
107
|
+
- `transform_messages` tries to auto-detect input format, but it's safer to specify `frm` explicitly.
|
|
108
|
+
|
|
109
|
+
3. **Tokenizer Support**:
|
|
110
|
+
- You can pass a HuggingFace `tokenizer` to `transform_messages` to use its specific chat template.
|
|
111
|
+
|
|
112
|
+
## Limitations
|
|
113
|
+
|
|
114
|
+
- **Streaming**: Memoization does NOT work with streaming responses (`stream=True`).
|
|
115
|
+
- **Non-deterministic Outputs**: A cached call always returns the first stored response. If your LLM calls rely on randomness (high temperature) and you want different results each time, disable caching or change the seed/input.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from llm_utils.chat_format.transform import transform_messages
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def main():
|
|
5
|
+
# 1. Define ShareGPT data
|
|
6
|
+
sharegpt_data = {
|
|
7
|
+
"conversations": [
|
|
8
|
+
{"from": "human", "value": "What is the capital of France?"},
|
|
9
|
+
{"from": "gpt", "value": "The capital of France is Paris."}
|
|
10
|
+
]
|
|
11
|
+
}
|
|
12
|
+
print("Original ShareGPT:", sharegpt_data)
|
|
13
|
+
|
|
14
|
+
# 2. Convert to ChatML
|
|
15
|
+
chatml_data = transform_messages(sharegpt_data, frm="sharegpt", to="chatml")
|
|
16
|
+
print("\nConverted to ChatML:", chatml_data)
|
|
17
|
+
|
|
18
|
+
# 3. Convert to Text (Prompt)
|
|
19
|
+
text_data = transform_messages(chatml_data, frm="chatml", to="text")
|
|
20
|
+
print("\nConverted to Text Prompt:\n", text_data)
|
|
21
|
+
|
|
22
|
+
# 4. Convert to Simulated Chat
|
|
23
|
+
sim_chat = transform_messages(chatml_data, frm="chatml", to="simulated_chat")
|
|
24
|
+
print("\nConverted to Simulated Chat:\n", sim_chat)
|
|
25
|
+
|
|
26
|
+
if __name__ == "__main__":
|
|
27
|
+
main()
|