speedy-utils 1.1.35.tar.gz → 1.1.38.tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/PKG-INFO +1 -1
- speedy_utils-1.1.38/debug/debug_generate_response.py +33 -0
- speedy_utils-1.1.38/debug/debug_n_param.py +22 -0
- speedy_utils-1.1.38/debug/debug_n_structure.py +33 -0
- speedy_utils-1.1.38/debug/integration_test.py +88 -0
- speedy_utils-1.1.38/debug/test_generate.py +78 -0
- speedy_utils-1.1.38/debug/test_generate_endpoint.py +28 -0
- speedy_utils-1.1.38/docs/GENERATE_QUICKREF.md +125 -0
- speedy_utils-1.1.38/docs/TOKENIZATION.md +298 -0
- speedy_utils-1.1.38/examples/generate_example.py +140 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/pyproject.toml +2 -1
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/llm.py +86 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/mixins.py +204 -0
- speedy_utils-1.1.38/src/llm_utils/scripts/fast_vllm.py +131 -0
- speedy_utils-1.1.35/docs/TOKENIZATION.md +0 -149
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/copilot-instructions.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/caching-utilities/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/caching-utilities/examples/caching_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/io-utilities/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/io-utilities/examples/io_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/llm-integration/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/llm-integration/examples/llm_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/multi-threading-processing/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/ray-distributed-computing/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/skill-creation/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/vision-utilities/SKILL.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/skills/vision-utilities/examples/vision_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.github/workflows/publish.yml +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.gitignore +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/.pre-commit-config.yaml +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/README.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/bumpversion.sh +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/debug/test_decode_api.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/debug/test_endpoints.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/docs/IMPLEMENTATION.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/docs/QUICKSTART.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/docs/TOKENIZATION_IMPLEMENTATION.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/docs/zero_copy_sharing.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/examples/pytorch_large_model.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/examples/shared_kwargs_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/examples/temperature_range_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/examples/test_share_ray.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/examples/tokenization_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/examples/vision_utils_example.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/experiments/exp1/dockerfile +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/experiments/exp1/run_in_docker.sh +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/experiments/exp1/test.png +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/experiments/test_read_image.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/notebooks/README.ipynb +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/notebooks/llm_utils/llm_as_a_judge.ipynb +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/notebooks/ray_tutorial.ipynb +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/notebooks/test_multi_thread.ipynb +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/ruff.toml +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/scripts/debug_import_time.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/scripts/deploy.sh +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/scripts/imports.sh +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/scripts/test_import_time_vision.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/setup.cfg +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/datasets/convert_to_arrow.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/chat_format/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/chat_format/display.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/chat_format/transform.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/chat_format/utils.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/group_messages.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/async_lm/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/async_lm/_utils.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/async_lm/async_lm.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/async_lm/async_lm_base.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/base_prompt_builder.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/llm_signature.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/lm_base.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/openai_memoize.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/signature.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/lm/utils.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/scripts/README.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/scripts/vllm_serve.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/vector_cache/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/vector_cache/cli.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/vector_cache/core.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/vector_cache/types.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/llm_utils/vector_cache/utils.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/__imports.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/clock.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/function_decorator.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/logger.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/notebook_utils.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/patcher.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/report_manager.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/utils_cache.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/utils_io.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/utils_misc.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/common/utils_print.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/multi_worker/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/multi_worker/process.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/multi_worker/thread.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/scripts/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/scripts/mpython.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/vision_utils/README.md +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/vision_utils/__init__.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/vision_utils/io_utils.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/src/vision_utils/plot.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/import_all.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/import_time_report.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/integration_test.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/llm_utils/test_llm_mixins.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/sample_objects.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_logger.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_logger_format.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_memoize_typing.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_mpython.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_multithread_error_trace.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_process.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_process_update.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_pytorch_sharing.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_shared_kwargs.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_thread.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/tests/test_tokenization.py +0 -0
- {speedy_utils-1.1.35 → speedy_utils-1.1.38}/uv.lock +0 -0
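The new files in 1.1.38 (the `debug/` scripts, `docs/GENERATE_QUICKREF.md`, and the additions to `src/llm_utils/lm/llm.py` and `mixins.py`) all revolve around a tokenize/encode/decode/generate surface on the `LLM` class. As a reading aid before the hunks below, here is a minimal sketch of that surface; it is illustrative only, not part of the diff, and assumes a local inference server on port 8000 exactly as the debug scripts do.

```python
# Illustrative sketch only (not part of the diff). Assumes a local
# inference server on port 8000, as used throughout the debug scripts below.
from llm_utils.lm import LLM

lm = LLM(client=8000)                    # the scripts pass a port number as `client`

token_ids = lm.encode('Hello, world!')   # text -> list[int]
text = lm.decode(token_ids)              # list[int] -> str

result = lm.generate('The capital of France is', max_tokens=20, temperature=0.7)
print(result['text'])                    # n=1 returns a single dict with 'text'
```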
speedy_utils-1.1.38/debug/debug_generate_response.py
@@ -0,0 +1,33 @@
+"""Debug the actual response from generate endpoint."""
+
+import requests
+import json
+
+base_url = 'http://localhost:8000'
+
+# Encode input first
+response = requests.post(
+    f'{base_url}/tokenize',
+    json={'prompt': 'Hello, how are you?', 'add_special_tokens': True},
+)
+token_ids = response.json()['tokens']
+print(f'Input token IDs: {token_ids}')
+print()
+
+# Call generate
+request_data = {
+    'token_ids': token_ids,
+    'sampling_params': {
+        'max_tokens': 20,
+        'temperature': 0.7,
+        'n': 1,
+    },
+}
+
+response = requests.post(
+    f'{base_url}/inference/v1/generate',
+    json=request_data,
+)
+print(f'Status code: {response.status_code}')
+print(f'Response JSON:')
+print(json.dumps(response.json(), indent=2))
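The script above exercises the server's raw HTTP endpoints directly: `/tokenize` to turn the prompt into token IDs, then `/inference/v1/generate` with a `sampling_params` payload. Below is a minimal sketch of that same two-step flow wrapped into a helper; it is illustrative only, the endpoint paths and payload shape are taken from the script above, and the timeouts are arbitrary choices.

```python
import requests


def generate_raw(prompt: str, base_url: str = 'http://localhost:8000',
                 max_tokens: int = 20, temperature: float = 0.7) -> dict:
    """Tokenize a prompt, then call the raw generate endpoint (sketch of the flow above)."""
    # Step 1: text -> token IDs via the /tokenize endpoint.
    tok = requests.post(
        f'{base_url}/tokenize',
        json={'prompt': prompt, 'add_special_tokens': True},
        timeout=30,
    )
    tok.raise_for_status()
    token_ids = tok.json()['tokens']

    # Step 2: token IDs -> completion via /inference/v1/generate.
    gen = requests.post(
        f'{base_url}/inference/v1/generate',
        json={
            'token_ids': token_ids,
            'sampling_params': {'max_tokens': max_tokens, 'temperature': temperature, 'n': 1},
        },
        timeout=60,
    )
    gen.raise_for_status()
    return gen.json()
```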
speedy_utils-1.1.38/debug/debug_n_param.py
@@ -0,0 +1,22 @@
+"""Debug n parameter."""
+
+from llm_utils.lm import LLM
+import json
+
+lm = LLM(client=8000)
+
+# Test with different n values
+for n_val in [1, 2, 3, 4]:
+    print(f'\nTesting n={n_val}')
+    results = lm.generate('AI is', max_tokens=10, n=n_val, temperature=0.8)
+
+    if isinstance(results, list):
+        print(f' Returned {len(results)} results (expected {n_val})')
+        raw = results[0].get('_raw_response', {})
+    else:
+        print(f' Returned single dict (expected {n_val})')
+        raw = results.get('_raw_response', {})
+
+    # Check raw response
+    if 'choices' in raw:
+        print(f' Raw response has {len(raw["choices"])} choices')
speedy_utils-1.1.38/debug/debug_n_structure.py
@@ -0,0 +1,33 @@
+"""Debug n parameter structure."""
+
+from llm_utils.lm import LLM
+import json
+
+lm = LLM(client=8000)
+
+result = lm.generate('Test', max_tokens=5, n=3, temperature=0.8)
+
+print('Type:', type(result))
+print('Length:', len(result) if isinstance(result, list) else 'N/A')
+print()
+
+if isinstance(result, list):
+    print('First result keys:', result[0].keys())
+    raw = result[0]['_raw_response']
+    print('Raw response type:', type(raw))
+    print('Raw response keys:', raw.keys() if isinstance(raw, dict) else 'N/A')
+    print()
+
+# Get actual response
+import requests
+base_url = 'http://localhost:8000'
+token_ids = lm.encode('Test')
+response = requests.post(
+    f'{base_url}/inference/v1/generate',
+    json={
+        'token_ids': token_ids,
+        'sampling_params': {'max_tokens': 5, 'n': 3, 'temperature': 0.8},
+    },
+)
+print('Actual API response:')
+print(json.dumps(response.json(), indent=2))
speedy_utils-1.1.38/debug/integration_test.py
@@ -0,0 +1,88 @@
+"""Final integration test for encode/decode/generate functionality."""
+
+from llm_utils.lm import LLM
+
+print('Testing LLM encode/decode/generate integration...\n')
+
+# Initialize
+lm = LLM(client=8000)
+
+# Test 1: Encode/Decode
+print('✓ Test 1: Encode/Decode')
+text = 'Hello, world!'
+tokens = lm.encode(text)
+decoded = lm.decode(tokens)
+print(f' Original: {text}')
+print(f' Tokens: {tokens}')
+print(f' Decoded: {decoded}')
+assert isinstance(tokens, list)
+assert all(isinstance(t, int) for t in tokens)
+print()
+
+# Test 2: Generate from text
+print('✓ Test 2: Generate from text')
+result = lm.generate('The answer is', max_tokens=10, temperature=0.5)
+print(f' Input: "The answer is"')
+print(f' Output: {result["text"][:50]}')
+assert 'text' in result
+print()
+
+# Test 3: Generate from token IDs
+print('✓ Test 3: Generate from token IDs')
+input_ids = lm.encode('Python is')
+result = lm.generate(input_ids, max_tokens=15, return_token_ids=True)
+print(f' Input IDs: {input_ids}')
+print(f' Output: {result["text"][:50]}')
+print(f' Output IDs: {result.get("token_ids", "N/A")}')
+assert 'text' in result
+assert 'token_ids' in result
+print()
+
+# Test 4: Multiple generations
+print('✓ Test 4: Multiple generations (n=3)')
+results = lm.generate('AI is', max_tokens=10, n=3, temperature=0.8)
+print(f' Input: "AI is"')
+for i, r in enumerate(results, 1):
+    print(f' {i}. {r["text"][:40]}')
+# Note: Server may return fewer choices than requested
+assert isinstance(results, list)
+assert len(results) >= 1
+print()
+
+# Test 5: Temperature control
+print('✓ Test 5: Temperature control')
+low_temp = lm.generate('1 + 1 =', max_tokens=5, temperature=0.1, seed=42)
+high_temp = lm.generate('1 + 1 =', max_tokens=5, temperature=1.5, seed=123)
+print(f' Low temp (0.1): {low_temp["text"][:30]}')
+print(f' High temp (1.5): {high_temp["text"][:30]}')
+print()
+
+# Test 6: Stop sequences
+print('✓ Test 6: Stop sequences')
+result = lm.generate(
+    'Count: 1, 2, 3,',
+    max_tokens=50,
+    stop=[', 6', '\n'],
+    temperature=0.7,
+)
+print(f' Input: "Count: 1, 2, 3,"')
+print(f' Output (stop at ", 6"): {result["text"][:50]}')
+print()
+
+# Test 7: Seed reproducibility
+print('✓ Test 7: Seed reproducibility')
+r1 = lm.generate('Test', max_tokens=10, seed=999, temperature=0.8)
+r2 = lm.generate('Test', max_tokens=10, seed=999, temperature=0.8)
+print(f' Run 1: {r1["text"][:40]}')
+print(f' Run 2: {r2["text"][:40]}')
+print(f' Match: {r1["text"] == r2["text"]}')
+print()
+
+print('=' * 60)
+print('✅ All tests passed!')
+print('=' * 60)
+print()
+print('Available methods:')
+print(' - lm.encode(text) -> list[int]')
+print(' - lm.decode(token_ids) -> str')
+print(' - lm.generate(input, **kwargs) -> dict')
speedy_utils-1.1.38/debug/test_generate.py
@@ -0,0 +1,78 @@
+"""Test the generate method with HuggingFace-style interface."""
+
+from llm_utils.lm import LLM
+
+# Initialize LLM
+lm = LLM(client=8000)
+
+print('=' * 60)
+print('Test 1: Basic text generation')
+print('=' * 60)
+result = lm.generate(
+    'The capital of France is',
+    max_tokens=50,
+    temperature=0.7,
+)
+print(f"Input: 'The capital of France is'")
+print(f"Generated: {result.get('text', 'N/A')}")
+print()
+
+print('=' * 60)
+print('Test 2: Generation with token IDs input')
+print('=' * 60)
+# Encode input first
+input_text = 'Hello, how are you?'
+token_ids = lm.encode(input_text)
+print(f'Input text: {input_text}')
+print(f'Token IDs: {token_ids}')
+
+result = lm.generate(
+    token_ids,
+    max_tokens=30,
+    temperature=0.8,
+    return_token_ids=True,
+)
+print(f"Generated text: {result.get('text', 'N/A')}")
+print(f"Generated token IDs: {result.get('token_ids', 'N/A')}")
+print()
+
+print('=' * 60)
+print('Test 3: Multiple generations (n=3)')
+print('=' * 60)
+results = lm.generate(
+    'Once upon a time',
+    max_tokens=30,
+    temperature=0.9,
+    n=3,
+)
+for i, result in enumerate(results, 1):
+    print(f"Generation {i}: {result.get('text', 'N/A')}")
+print()
+
+print('=' * 60)
+print('Test 4: Generation with sampling parameters')
+print('=' * 60)
+result = lm.generate(
+    'The best programming language is',
+    max_tokens=40,
+    temperature=0.5,
+    top_p=0.9,
+    top_k=50,
+    repetition_penalty=1.1,
+)
+print(f"Generated: {result.get('text', 'N/A')}")
+print()
+
+print('=' * 60)
+print('Test 5: Generation with stop sequences')
+print('=' * 60)
+result = lm.generate(
+    'List three colors:\n1.',
+    max_tokens=100,
+    temperature=0.7,
+    stop=['\n4.', 'That'],
+)
+print(f"Generated: {result.get('text', 'N/A')}")
+print()
+
+print('✓ All tests completed!')
speedy_utils-1.1.38/debug/test_generate_endpoint.py
@@ -0,0 +1,28 @@
+"""Check which endpoint path works for generate."""
+
+import requests
+
+# Test different endpoint paths
+test_urls = [
+    'http://localhost:8000/inference/v1/generate',
+    'http://localhost:8000/v1/inference/v1/generate',
+    'http://localhost:8000/generate',
+]
+
+# Dummy request payload
+dummy_payload = {
+    'token_ids': [1, 2, 3],
+    'sampling_params': {
+        'max_tokens': 10,
+        'temperature': 1.0,
+    },
+}
+
+for url in test_urls:
+    try:
+        response = requests.post(url, json=dummy_payload)
+        print(f'✓ {url} - Status: {response.status_code}')
+        if response.status_code in [200, 400, 422]:  # 400/422 might be validation error but endpoint exists
+            print(f' Endpoint exists!')
+    except Exception as e:
+        print(f'✗ {url} - Error: {type(e).__name__}')
speedy_utils-1.1.38/docs/GENERATE_QUICKREF.md
@@ -0,0 +1,125 @@
+# Quick Reference: generate() Method
+
+## Summary
+
+The `LLM.generate()` method provides a HuggingFace Transformers-style interface for low-level text generation, working directly with token IDs.
+
+## Basic Signature
+
+```python
+lm.generate(
+    input_context: str | list[int],
+    max_tokens=512,
+    temperature=1.0,
+    n=1,
+    **kwargs
+) -> dict | list[dict]
+```
+
+## Common Use Cases
+
+### 1. Simple Generation
+```python
+result = lm.generate('Hello world', max_tokens=50)
+print(result['text'])
+```
+
+### 2. From Token IDs
+```python
+token_ids = lm.encode('Hello')
+result = lm.generate(token_ids, max_tokens=50, return_token_ids=True)
+```
+
+### 3. Multiple Outputs
+```python
+results = lm.generate('Start:', max_tokens=30, n=5)  # 5 different completions
+```
+
+### 4. Temperature Control
+```python
+# Deterministic (low temp)
+result = lm.generate(prompt, temperature=0.1)
+
+# Creative (high temp)
+result = lm.generate(prompt, temperature=1.5)
+```
+
+### 5. Advanced Sampling
+```python
+result = lm.generate(
+    prompt,
+    temperature=0.8,
+    top_k=50,
+    top_p=0.95,
+    repetition_penalty=1.2,
+)
+```
+
+### 6. Stop Sequences
+```python
+result = lm.generate(
+    'List:\n1.',
+    max_tokens=200,
+    stop=['\n\n', 'End'],
+)
+```
+
+### 7. Reproducible
+```python
+result = lm.generate(prompt, seed=42)  # Same seed = same output
+```
+
+## Parameter Guide
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `input_context` | str \| list[int] | required | Text or token IDs |
+| `max_tokens` | int | 512 | Max tokens to generate |
+| `temperature` | float | 1.0 | Randomness (0.0-2.0) |
+| `top_p` | float | 1.0 | Nucleus sampling |
+| `top_k` | int | -1 | Top-k sampling (-1=off) |
+| `n` | int | 1 | Number of completions |
+| `stop` | str \| list | None | Stop sequences |
+| `seed` | int | None | Random seed |
+| `repetition_penalty` | float | 1.0 | Repeat penalty (1.0=off) |
+| `return_token_ids` | bool | False | Include token IDs |
+| `return_text` | bool | True | Include text |
+
+## Return Format
+
+Single generation (n=1):
+```python
+{
+    'text': 'generated text...',
+    'token_ids': [1, 2, 3, ...],  # if return_token_ids=True
+    'finish_reason': 'length',
+    '_raw_response': {...}
+}
+```
+
+Multiple generations (n>1):
+```python
+[
+    {'text': '...', 'finish_reason': '...'},
+    {'text': '...', 'finish_reason': '...'},
+    ...
+]
+```
+
+## Comparison to HuggingFace
+
+| HuggingFace | llm_utils | Notes |
+|-------------|-----------|-------|
+| `model.generate(input_ids=...)` | `lm.generate(token_ids)` | Same concept |
+| `max_length` | `max_tokens` | Different naming |
+| `num_return_sequences` | `n` | Different naming |
+| `do_sample=True` | `temperature > 0` | Auto-enabled |
+| `num_beams` | N/A | Not supported |
+
+## Tips
+
+1. **Token Counting**: Use `len(lm.encode(text))` to count tokens before generating
+2. **Reproducibility**: Set `seed` for deterministic output
+3. **Quality vs Speed**: Lower temperature for quality, higher for creativity
+4. **Stop Early**: Use `stop` sequences to control output format
+5. **Debug**: Check `result['_raw_response']` for full API response