bulk-chain 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/PKG-INFO +28 -9
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/README.md +27 -7
- bulk_chain-1.1.0/bulk_chain/api.py +186 -0
- bulk_chain-1.1.0/bulk_chain/core/llm_base.py +24 -0
- bulk_chain-1.1.0/bulk_chain/core/service_asyncio.py +65 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_batch.py +2 -2
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/utils.py +21 -2
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/PKG-INFO +28 -9
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/SOURCES.txt +4 -4
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/setup.py +1 -2
- bulk_chain-1.1.0/test/test_api.py +67 -0
- bulk_chain-1.0.0/test/test_provider_batching.py → bulk_chain-1.1.0/test/test_api_batching.py +3 -5
- bulk_chain-1.1.0/test/test_api_streaming.py +21 -0
- bulk_chain-1.1.0/test/test_replicate_async_baseline.py +11 -0
- bulk_chain-1.1.0/test/test_replicate_async_batch_async.py +37 -0
- bulk_chain-1.0.0/bulk_chain/api.py +0 -143
- bulk_chain-1.0.0/bulk_chain/core/llm_base.py +0 -50
- bulk_chain-1.0.0/bulk_chain.egg-info/requires.txt +0 -1
- bulk_chain-1.0.0/test/test.py +0 -62
- bulk_chain-1.0.0/test/test_api.py +0 -34
- bulk_chain-1.0.0/test/test_api_streaming.py +0 -52
- bulk_chain-1.0.0/test/test_args_seeking.py +0 -26
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/LICENSE +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/__init__.py +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/__init__.py +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_data.py +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_dict.py +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_json.py +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_schema.py +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/dependency_links.txt +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/top_level.txt +0 -0
- {bulk_chain-1.0.0 → bulk_chain-1.1.0}/setup.cfg +0 -0
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bulk_chain
-Version: 1.0.0
+Version: 1.1.0
 Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
 Home-page: https://github.com/nicolay-r/bulk-chain
 Author: Nicolay Rusnachenko
@@ -15,9 +15,8 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: tqdm
 
-# bulk-chain 1.0.0
+# bulk-chain 1.1.0
 [](…)
 [](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
 [](https://x.com/nicolayr_/status/1847969224636961033)
@@ -78,16 +77,36 @@ Below, is an example on how to declare your own schema:
 
 # Usage
 
-
+## 🤖 Prepare
 
-1.
-
+1. [schema](#chain-of-thought-schema)
+   * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
+2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+3. Data (iter of dictionaries)
 
+## 🚀 Launch
 
+> **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
 
-
-
-
+```python
+from bulk_chain.core.utils import dynamic_init
+from bulk_chain.api import iter_content
+
+content_it = iter_content(
+    # 1. Your schema.
+    schema="YOUR_SCHEMA.json",
+    # 2. Your third-party model implementation.
+    llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
+    # 3. Customize your inference and result providing modes:
+    infer_mode="batch_async",
+    return_mode="batch",
+    # 4. Your iterator of dictionaries
+    input_dicts_it=YOUR_DATA_IT,
+)
+
+for content in content_it:
+    # Handle your LLM responses here ...
+```
 
 
 # Embed your LLM
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/README.md

@@ -1,4 +1,4 @@
-# bulk-chain 1.0.0
+# bulk-chain 1.1.0
 [](…)
 [](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
 [](https://x.com/nicolayr_/status/1847969224636961033)
@@ -59,16 +59,36 @@ Below, is an example on how to declare your own schema:
 
 # Usage
 
-
+## 🤖 Prepare
 
-1.
-
+1. [schema](#chain-of-thought-schema)
+   * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
+2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+3. Data (iter of dictionaries)
 
+## 🚀 Launch
 
+> **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
 
-
-
-
+```python
+from bulk_chain.core.utils import dynamic_init
+from bulk_chain.api import iter_content
+
+content_it = iter_content(
+    # 1. Your schema.
+    schema="YOUR_SCHEMA.json",
+    # 2. Your third-party model implementation.
+    llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
+    # 3. Customize your inference and result providing modes:
+    infer_mode="batch_async",
+    return_mode="batch",
+    # 4. Your iterator of dictionaries
+    input_dicts_it=YOUR_DATA_IT,
+)
+
+for content in content_it:
+    # Handle your LLM responses here ...
+```
 
 
 # Embed your LLM
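For context, a minimal end-to-end sketch of the launch example above, assuming the `replicate_104.py` provider file has been downloaded next to the script and the sample schema from the test suite is used; the input iterator is just a generator of plain dictionaries (the record shape is borrowed from the package's own tests):

```python
from bulk_chain.core.utils import dynamic_init
from bulk_chain.api import iter_content


def it_data(n):
    # Plain dicts; prompt templates in the schema reference these keys.
    for i in range(n):
        yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}


# Assumptions: replicate_104.py sits in the working directory and <API-KEY> is valid.
llm = dynamic_init(class_filepath="replicate_104.py",
                   class_name="Replicate")(api_token="<API-KEY>")

for batch in iter_content(input_dicts_it=it_data(10),
                          llm=llm,
                          batch_size=5,
                          infer_mode="batch_async",
                          return_mode="batch",
                          schema="test/schema/thor_cot_schema.json"):
    for record in batch:
        print(record)
```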
bulk_chain-1.1.0/bulk_chain/api.py
ADDED

@@ -0,0 +1,186 @@
+import asyncio
+import collections
+import logging
+import os
+from itertools import chain
+
+from bulk_chain.core.llm_base import BaseLM
+from bulk_chain.core.service_asyncio import AsyncioService
+from bulk_chain.core.service_batch import BatchIterator
+from bulk_chain.core.service_data import DataService
+from bulk_chain.core.service_dict import DictionaryService
+from bulk_chain.core.service_json import JsonService
+from bulk_chain.core.service_schema import SchemaService
+from bulk_chain.core.utils import attempt_wrapper
+
+
+INFER_MODES = {
+    "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
+    "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
+    "batch": lambda llm, batch, **kwargs: llm.ask(batch),
+    "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
+        batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
+    ),
+    "batch_stream_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
+        batch=batch, async_handler=llm.ask_stream_async, event_loop=kwargs.get("event_loop")
+    ),
+}
+
+
+CWD = os.getcwd()
+
+
+def _iter_batch_prompts(c, batch_content_it, **kwargs):
+    for ind_in_batch, entry in enumerate(batch_content_it):
+        content = DataService.get_prompt_text(
+            prompt=entry[c]["prompt"],
+            data_dict=entry,
+            handle_missed_func=kwargs["handle_missed_value_func"])
+        yield ind_in_batch, content
+
+
+def __handle_agen_to_gen(handle, batch, event_loop):
+    """ This handler provides conversion of the async generator to generator (sync).
+    """
+
+    def __wrap_with_index(async_gens):
+        async def wrapper(index, agen):
+            async for item in agen:
+                yield index, item
+        return [wrapper(i, agen) for i, agen in enumerate(async_gens)]
+
+    agen_list = handle(batch, event_loop=event_loop)
+
+    it = AsyncioService.async_gen_to_iter(
+        gen=AsyncioService.merge_generators(*__wrap_with_index(agen_list)),
+        loop=event_loop)
+
+    for ind_in_batch, chunk in it:
+        yield ind_in_batch, str(chunk)
+
+
+def __handle_gen(handle, batch, event_loop):
+    """ This handler deals with the iteration of each individual element of the batch.
+    """
+
+    def _iter_entry_content(entry):
+        if isinstance(entry, str):
+            yield entry
+        elif isinstance(entry, collections.abc.Iterable):
+            for chunk in map(lambda item: str(item), entry):
+                yield chunk
+        else:
+            raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
+
+    for ind_in_batch, entry in enumerate(handle(batch, event_loop=event_loop)):
+        for chunk in _iter_entry_content(entry=entry):
+            yield ind_in_batch, chunk
+
+
+def _iter_chunks(p_column, batch_content_it, **kwargs):
+    handler = __handle_agen_to_gen if kwargs["infer_mode"] == "batch_stream_async" else __handle_gen
+    p_batch = [item[p_column] for item in batch_content_it]
+    it = handler(handle=kwargs["handle_batch_func"], batch=p_batch, event_loop=kwargs["event_loop"])
+    for ind_in_batch, chunk in it:
+        yield ind_in_batch, chunk
+
+
+def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
+    assert (isinstance(batch, list))
+
+    if len(batch) == 0:
+        return batch
+
+    if cols is None:
+        first_item = batch[0]
+        cols = list(first_item.keys()) if cols is None else cols
+
+    for c in cols:
+
+        # Handling prompt column.
+        if c in schema.p2r:
+            content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
+            for ind_in_batch, prompt in content_it:
+                batch[ind_in_batch][c] = prompt
+
+        # Handling column for inference.
+        if c in schema.r2p:
+            content_it = _iter_chunks(p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
+            # Register values.
+            for item in batch:
+                item[c] = []
+            for ind_in_batch, chunk in content_it:
+                # Append batch.
+                batch[ind_in_batch][c].append(chunk)
+                # Returning (optional).
+                if return_mode == "chunk":
+                    global_ind = batch_ind * len(batch) + ind_in_batch
+                    yield [global_ind, c, chunk]
+
+            # Convert content to string.
+            for item in batch:
+                item[c] = "".join(item[c])
+
+    if return_mode == "record":
+        for record in batch:
+            yield record
+
+    if return_mode == "batch":
+        yield batch
+
+
+def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
+                 infer_mode="batch", return_mode="batch", attempts=1, event_loop=None,
+                 **kwargs):
+    """ This method represent Python API aimed at application of `llm` towards
+        iterator of input_dicts via cache_target that refers to the SQLite using
+        the given `schema`
+    """
+    assert (infer_mode in INFER_MODES.keys())
+    assert (return_mode in ["batch", "chunk", "record"])
+    assert (isinstance(llm, BaseLM))
+
+    # Setup event loop.
+    event_loop = asyncio.get_event_loop_policy().get_event_loop() \
+        if event_loop is None else event_loop
+
+    # Quick initialization of the schema.
+    if isinstance(schema, str):
+        schema = JsonService.read(schema)
+    if isinstance(schema, dict):
+        schema = SchemaService(json_data=schema)
+
+    prompts_it = map(
+        lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
+        input_dicts_it
+    )
+
+    handle_batch_func = lambda batch, **handle_kwargs: INFER_MODES[infer_mode](
+        llm,
+        DataService.limit_prompts(batch, limit=limit_prompt),
+        **handle_kwargs
+    )
+
+    # Optional wrapping into attempts.
+    if attempts > 1:
+        # Optional setup of the logger.
+        logger = logging.getLogger(__name__)
+        logging.basicConfig(level=logging.INFO)
+
+        attempt_dec = attempt_wrapper(attempts=attempts,
+                                      delay_sec=kwargs.get("attempt_delay_sec", 1),
+                                      logger=logger)
+        handle_batch_func = attempt_dec(handle_batch_func)
+
+    content_it = (_infer_batch(batch=batch,
+                               batch_ind=batch_ind,
+                               infer_mode=infer_mode,
+                               handle_batch_func=handle_batch_func,
+                               handle_missed_value_func=lambda *_: None,
+                               return_mode=return_mode,
+                               schema=schema,
+                               event_loop=event_loop,
+                               **kwargs)
+                  for batch_ind, batch in enumerate(BatchIterator(prompts_it, batch_size=batch_size)))
+
+    yield from chain.from_iterable(content_it)
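A sketch of what each `return_mode` yields from the new `iter_content`, using a hypothetical `EchoLM` stub in place of a real provider; the schema path is a placeholder and must point at an actual schema file:

```python
from bulk_chain.api import iter_content
from bulk_chain.core.llm_base import BaseLM


class EchoLM(BaseLM):
    # Hypothetical stand-in: a real provider would query an LLM here.
    def ask(self, content):
        return f"echo({content})"


records = [{"ind": i, "text": f"sample {i}"} for i in range(4)]

# "batch": one list of completed dicts per batch of size `batch_size`.
# "record": completed dicts one by one.
# "chunk": [global_index, column, chunk] triples as chunks arrive,
#          where global_index = batch_ind * len(batch) + ind_in_batch.
for mode in ("batch", "record", "chunk"):
    out = iter_content(input_dicts_it=iter(records),
                       llm=EchoLM(),
                       batch_size=2,
                       infer_mode="single",
                       return_mode=mode,
                       schema="schema/default.json")  # placeholder path
    print(mode, "->", next(iter(out)))
```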
bulk_chain-1.1.0/bulk_chain/core/llm_base.py
ADDED

@@ -0,0 +1,24 @@
+class BaseLM(object):
+
+    def __init__(self, **kwargs):
+        pass
+
+    def ask(self, content):
+        """ Assumes to return str.
+        """
+        raise NotImplemented()
+
+    def ask_stream(self, content):
+        """ Assumes to return generator.
+        """
+        raise NotImplemented()
+
+    async def ask_async(self, prompt):
+        """ Assumes to return co-routine.
+        """
+        raise NotImplemented()
+
+    async def ask_stream_async(self, batch):
+        """ Assumes to return AsyncGenerator.
+        """
+        raise NotImplemented()
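A minimal sketch of a conforming provider, filling in all four hooks with a toy upper-casing "model" (a real provider would call an external service). Note that `raise NotImplemented()` in the base class above is the package's own code as shipped; `NotImplementedError` is the conventional exception. As wired in `api.py`, `ask_stream_async` receives one prompt at a time despite its `batch` parameter name, and per its docstring it is a coroutine that returns an async generator:

```python
import asyncio

from bulk_chain.core.llm_base import BaseLM


class UpperCaseLM(BaseLM):
    """ Toy provider: 'inference' is just upper-casing the prompt. """

    def ask(self, content):
        # Sync call, returns str.
        return content.upper()

    def ask_stream(self, content):
        # Sync generator of chunks.
        for token in content.upper().split():
            yield token + " "

    async def ask_async(self, prompt):
        # Coroutine returning str; a real provider awaits an API call here.
        await asyncio.sleep(0)
        return prompt.upper()

    async def ask_stream_async(self, batch):
        # Coroutine returning an AsyncGenerator of chunks.
        async def agen():
            for token in batch.upper().split():
                await asyncio.sleep(0)
                yield token + " "
        return agen()
```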
bulk_chain-1.1.0/bulk_chain/core/service_asyncio.py
ADDED

@@ -0,0 +1,65 @@
+import asyncio
+from typing import AsyncGenerator, Any
+
+
+class AsyncioService:
+
+    @staticmethod
+    async def _run_tasks_async(batch, async_handler):
+        tasks = [async_handler(prompt) for prompt in batch]
+        return await asyncio.gather(*tasks)
+
+    @staticmethod
+    async def _run_generator(gen, output_queue, idx):
+        try:
+            async for item in gen:
+                await output_queue.put((idx, item))
+        finally:
+            await output_queue.put((idx, StopAsyncIteration))
+
+
+    @staticmethod
+    def run_tasks(event_loop, **tasks_kwargs):
+        return event_loop.run_until_complete(AsyncioService._run_tasks_async(**tasks_kwargs))
+
+    @staticmethod
+    async def merge_generators(*gens: AsyncGenerator[Any, None]) -> AsyncGenerator[Any, None]:
+
+        output_queue = asyncio.Queue()
+        tasks = [
+            asyncio.create_task(AsyncioService._run_generator(gen, output_queue, idx))
+            for idx, gen in enumerate(gens)
+        ]
+
+        finished = set()
+        while len(finished) < len(tasks):
+            idx, item = await output_queue.get()
+            if item is StopAsyncIteration:
+                finished.add(idx)
+            else:
+                yield item
+
+        for task in tasks:
+            task.cancel()
+
+    @staticmethod
+    def async_gen_to_iter(gen, loop=None):
+        """ This approach is limited. Could be considered as legacy.
+            https://stackoverflow.com/questions/71580727/translating-async-generator-into-sync-one/78573267#78573267
+        """
+
+        loop_created = False
+        if loop is None:
+            loop_created = True
+            loop = asyncio.new_event_loop()
+
+        asyncio.set_event_loop(loop)
+        try:
+            while True:
+                try:
+                    yield loop.run_until_complete(gen.__anext__())
+                except StopAsyncIteration:
+                    break
+        finally:
+            if loop_created:
+                loop.close()
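A standalone sketch of how these helpers compose: several async generators are merged into one stream whose items arrive in completion order, and `async_gen_to_iter` exposes that stream to synchronous code (the `ticker` generators here are toy stand-ins for streamed LLM responses):

```python
import asyncio

from bulk_chain.core.service_asyncio import AsyncioService


async def ticker(name, n):
    # A toy async generator standing in for a streaming LLM response.
    for i in range(n):
        await asyncio.sleep(0.01)
        yield f"{name}:{i}"


merged = AsyncioService.merge_generators(ticker("a", 3), ticker("b", 3))
for item in AsyncioService.async_gen_to_iter(merged):
    print(item)  # e.g. a:0, b:0, a:1, b:1, ... interleaved in completion order
```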
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_batch.py

@@ -1,8 +1,8 @@
 class BatchIterator:
 
     def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
-        assert(isinstance(batch_size, int) and batch_size > 0)
-        assert(callable(end_value) or end_value is None)
+        assert (isinstance(batch_size, int) and batch_size > 0)
+        assert (callable(end_value) or end_value is None)
         self.__data_iter = data_iter
         self.__index = 0
         self.__batch_size = batch_size
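The change above is whitespace-only. For reference, `api.py` consumes this class by iterating it directly; a sketch of the expected behavior, assuming (as `_infer_batch` asserts) that it yields lists of at most `batch_size` items:

```python
from bulk_chain.core.service_batch import BatchIterator

# Expected to yield [0, 1, 2], [3, 4, 5], [6] for batch_size=3
# (assumed list-shaped batches, matching the assert in _infer_batch).
for batch in BatchIterator(iter(range(7)), batch_size=3):
    print(batch)
```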
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/utils.py

@@ -1,6 +1,7 @@
 import importlib
 import logging
 import sys
+import time
 from collections import Counter
 from os.path import dirname, join, basename
 
@@ -60,10 +61,10 @@ def auto_import(name, is_class=False):
     return m() if is_class else m
 
 
-def dynamic_init(
+def dynamic_init(class_filepath, class_name=None):
 
     # Registering path.
-    target = join(
+    target = join(dirname(class_filepath))
     logger.info(f"Adding sys path for `{target}`")
     sys.path.insert(1, target)
     class_path_list = class_filepath.split('/')
@@ -89,3 +90,21 @@ def optional_limit_iter(it_data, limit=None):
         if limit is not None and counter["returned"] > limit:
             break
         yield data
+
+
+def attempt_wrapper(attempts, delay_sec=1, logger=None):
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            for i in range(attempts):
+                try:
+                    # Do action.
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    if logger is not None:
+                        logger.info(f"Unable to infer the result. Try {i} out of {attempts}.")
+                        logger.info(e)
+                    if delay_sec is not None:
+                        time.sleep(delay_sec)
+            raise Exception(f"Failed after {attempts} attempts")
+        return wrapper
+    return decorator
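A sketch of the new `attempt_wrapper` in isolation: it retries the wrapped callable up to `attempts` times, sleeps `delay_sec` between failures, and raises once the budget is exhausted (the `flaky` function is a contrived stand-in for an LLM call):

```python
import logging

from bulk_chain.core.utils import attempt_wrapper

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

state = {"calls": 0}


@attempt_wrapper(attempts=3, delay_sec=0.1, logger=logger)
def flaky():
    # Fails twice, then succeeds on the third try.
    state["calls"] += 1
    if state["calls"] < 3:
        raise RuntimeError("transient failure")
    return "ok"


print(flaky())  # logs two failed tries, then prints "ok"
```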
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bulk_chain
-Version: 1.0.0
+Version: 1.1.0
 Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
 Home-page: https://github.com/nicolay-r/bulk-chain
 Author: Nicolay Rusnachenko
@@ -15,9 +15,8 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: tqdm
 
-# bulk-chain 1.0.0
+# bulk-chain 1.1.0
 [](…)
 [](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
 [](https://x.com/nicolayr_/status/1847969224636961033)
@@ -78,16 +77,36 @@ Below, is an example on how to declare your own schema:
 
 # Usage
 
-
+## 🤖 Prepare
 
-1.
-
+1. [schema](#chain-of-thought-schema)
+   * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
+2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+3. Data (iter of dictionaries)
 
+## 🚀 Launch
 
+> **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
 
-
-
-
+```python
+from bulk_chain.core.utils import dynamic_init
+from bulk_chain.api import iter_content
+
+content_it = iter_content(
+    # 1. Your schema.
+    schema="YOUR_SCHEMA.json",
+    # 2. Your third-party model implementation.
+    llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
+    # 3. Customize your inference and result providing modes:
+    infer_mode="batch_async",
+    return_mode="batch",
+    # 4. Your iterator of dictionaries
+    input_dicts_it=YOUR_DATA_IT,
+)
+
+for content in content_it:
+    # Handle your LLM responses here ...
+```
 
 
 # Embed your LLM
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/SOURCES.txt

@@ -6,18 +6,18 @@ bulk_chain/api.py
 bulk_chain.egg-info/PKG-INFO
 bulk_chain.egg-info/SOURCES.txt
 bulk_chain.egg-info/dependency_links.txt
-bulk_chain.egg-info/requires.txt
 bulk_chain.egg-info/top_level.txt
 bulk_chain/core/__init__.py
 bulk_chain/core/llm_base.py
+bulk_chain/core/service_asyncio.py
 bulk_chain/core/service_batch.py
 bulk_chain/core/service_data.py
 bulk_chain/core/service_dict.py
 bulk_chain/core/service_json.py
 bulk_chain/core/service_schema.py
 bulk_chain/core/utils.py
-test/test.py
 test/test_api.py
+test/test_api_batching.py
 test/test_api_streaming.py
-test/
-test/
+test/test_replicate_async_baseline.py
+test/test_replicate_async_batch_async.py
{bulk_chain-1.0.0 → bulk_chain-1.1.0}/setup.py

@@ -15,7 +15,7 @@ def get_requirements(filenames):
 
 setup(
     name='bulk_chain',
-    version='1.0.0',
+    version='1.1.0',
     python_requires=">=3.6",
     description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
                 'ensuring reliable results for bulk input requests.',
@@ -36,5 +36,4 @@ setup(
             'chain-of-thought, '
             'reasoning',
     packages=find_packages(),
-    install_requires=get_requirements(['dependencies.txt'])
 )
bulk_chain-1.1.0/test/test_api.py
ADDED

@@ -0,0 +1,67 @@
+import unittest
+from os.path import join
+
+from bulk_chain.api import iter_content
+from utils import current_dir, DEFAULT_REMOTE_LLM
+
+
+class TestAPI(unittest.TestCase):
+
+
+    @staticmethod
+    def it_data(n):
+        for i in range(n):
+            yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
+
+    def test_single(self):
+        data_it = iter_content(input_dicts_it=self.it_data(20),
+                               llm=DEFAULT_REMOTE_LLM,
+                               batch_size=1,
+                               infer_mode="single",
+                               return_mode="batch",
+                               schema=join(current_dir, "schema/default.json"))
+
+        for data in data_it:
+            print(data)
+
+    def test_single_stream(self):
+        """ Returns individual chunks.
+        """
+        data_it = iter_content(input_dicts_it=self.it_data(20),
+                               llm=DEFAULT_REMOTE_LLM,
+                               batch_size=1,
+                               infer_mode="single_stream",
+                               return_mode="chunk",
+                               schema=join(current_dir, "schema/default.json"))
+
+        for data in data_it:
+            print(data)
+
+    def test_batch_async(self):
+        """ Return batches that passed async at the Replicate.
+        """
+        data_it = iter_content(input_dicts_it=self.it_data(20),
+                               llm=DEFAULT_REMOTE_LLM,
+                               batch_size=5,
+                               infer_mode="batch_async",
+                               return_mode="batch",
+                               schema=join(current_dir, "schema/default.json"))
+
+        for batch in data_it:
+            for item in batch:
+                print(item)
+
+    def test_batch_stream_async(self):
+        data_it = iter_content(input_dicts_it=self.it_data(20),
+                               llm=DEFAULT_REMOTE_LLM,
+                               batch_size=5,
+                               infer_mode="batch_stream_async",
+                               return_mode="chunk",
+                               schema=join(current_dir, "schema/default.json"))
+
+        for chunk_info in data_it:
+            print(chunk_info)
+
+
+if __name__ == '__main__':
+    unittest.main()
bulk_chain-1.0.0/test/test_provider_batching.py → bulk_chain-1.1.0/test/test_api_batching.py
RENAMED
@@ -1,17 +1,15 @@
 import unittest
-from os.path import join
 
 from tqdm import tqdm
 
-from bulk_chain.api import
+from bulk_chain.api import iter_content
 from bulk_chain.core.utils import dynamic_init
 from utils import iter_test_jsonl_samples
 
 
 class TestProviderBatching(unittest.TestCase):
 
-    llm = dynamic_init(
-        class_filepath="providers/transformers_flan_t5.py",
+    llm = dynamic_init(class_filepath="providers/transformers_flan_t5.py",
                        class_name="FlanT5")(model_name="nicolay-r/flan-t5-tsa-thor-base",
                                             max_new_tokens=128)
 
@@ -19,9 +17,9 @@ class TestProviderBatching(unittest.TestCase):
         input_dicts_it = iter_test_jsonl_samples()
         data_it = iter_content(input_dicts_it=input_dicts_it,
                                llm=self.llm,
+                               infer_mode="batch",
                                batch_size=10,
                                return_batch=False,
-                               handle_missed_value_func=lambda *_: None,
                                schema="schema/thor_cot_schema.json")
 
         for item in tqdm(data_it):
bulk_chain-1.1.0/test/test_api_streaming.py
ADDED

@@ -0,0 +1,21 @@
+import unittest
+
+from bulk_chain.api import iter_content
+from utils import iter_test_jsonl_samples, DEFAULT_REMOTE_LLM
+
+
+class TestAPI_Streaming(unittest.TestCase):
+
+    def test_content_iter_mode(self):
+
+        input_dicts_it = iter_test_jsonl_samples()
+        data_it = iter_content(input_dicts_it=input_dicts_it,
+                               llm=DEFAULT_REMOTE_LLM,
+                               batch_size=1,
+                               infer_mode="single_stream",
+                               return_mode="chunk",
+                               attempts=2,
+                               schema="schema/thor_cot_schema.json")
+
+        for ind_in_batch, col, item in data_it:
+            print("\t".join([str(ind_in_batch), str(col), item]))
bulk_chain-1.1.0/test/test_replicate_async_baseline.py
ADDED

@@ -0,0 +1,11 @@
+from timeit import default_timer as timer
+from utils import DEFAULT_REMOTE_LLM
+
+start = timer()
+r = ["".join([str(s) for s in DEFAULT_REMOTE_LLM.ask(f"what's the color of the {p}")])
+     for p in ["sky", "ground", "water"]]
+end = timer()
+
+total = sum(len(i) for i in r)
+print(f"Completed [time: {end - start}]: {len(r[0])}, {len(r[1])}, {len(r[2])}")
+print(f"TPS: {total / (end-start)}")
bulk_chain-1.1.0/test/test_replicate_async_batch_async.py
ADDED

@@ -0,0 +1,37 @@
+from timeit import default_timer as timer
+import asyncio
+
+from utils import DEFAULT_REMOTE_LLM
+
+
+async def infer_item(prompt):
+    content = []
+    for chunk in DEFAULT_REMOTE_LLM.ask(prompt):
+        content.append(str(chunk))
+    return content
+
+
+async def coro_infer_llm(prompt):
+    print(f"launch: {prompt}")
+    r = None
+    for response in asyncio.as_completed([infer_item(prompt)]):
+        r = await response
+    return "".join(r)
+
+
+async def main():
+    batch = [f"what's the color of the {p}" for p in ["sky", "ground", "water"]]
+    routines = [coro_infer_llm(p) for p in batch]
+    return await asyncio.gather(*routines)
+
+
+start = timer()
+r = asyncio.run(main())
+end = timer()
+
+total = sum(len(i) for i in r)
+print(r[0])
+print(r[1])
+print(r[2])
+print(f"Completed [time: {end - start}]: {len(r[0])}, {len(r[1])}, {len(r[2])}")
+print(f"TPS: {total / (end-start)}")
bulk_chain-1.0.0/bulk_chain/api.py
DELETED

@@ -1,143 +0,0 @@
-import collections
-import os
-from itertools import chain
-
-from bulk_chain.core.llm_base import BaseLM
-from bulk_chain.core.service_batch import BatchIterator
-from bulk_chain.core.service_data import DataService
-from bulk_chain.core.service_dict import DictionaryService
-from bulk_chain.core.service_json import JsonService
-from bulk_chain.core.service_schema import SchemaService
-from bulk_chain.core.utils import dynamic_init, find_by_prefix
-
-
-INFER_MODES = {
-    "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
-        DataService.limit_prompts(batch, limit=limit_prompt))
-}
-
-
-CWD = os.getcwd()
-
-
-def _iter_entry_content(entry, entry_info=None, **kwargs):
-
-    if isinstance(entry, str):
-        kwargs.get("callback_str_func", lambda *_: None)(entry, entry_info)
-        yield entry
-    elif isinstance(entry, collections.abc.Iterable):
-        h = kwargs.get("callback_stream_func", lambda *_: None)
-        h(None, entry_info | {"action": "start"})
-        for chunk in map(lambda item: str(item), entry):
-            yield chunk
-            h(chunk, entry_info)
-        h(None, entry_info | {"action": "end"})
-    else:
-        raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
-
-
-def _iter_batch_prompts(c, batch_content_it, **kwargs):
-    for ind_in_batch, entry in enumerate(batch_content_it):
-        content = DataService.get_prompt_text(
-            prompt=entry[c]["prompt"],
-            data_dict=entry,
-            handle_missed_func=kwargs["handle_missed_value_func"])
-        yield ind_in_batch, content
-
-
-def _iter_batch_responses(p_column, c, batch_content_it, **kwargs):
-    p_batch = [item[p_column] for item in batch_content_it]
-    # TODO. This part could be async.
-    # TODO. ind_in_batch might be a part of the async return.
-    for ind_in_batch, entry in enumerate(kwargs["handle_batch_func"](p_batch)):
-        yield ind_in_batch, _iter_entry_content(entry=entry, entry_info={"ind": ind_in_batch, "param": c}, **kwargs)
-
-
-def _infer_batch(batch, schema, return_mode, cols=None, **kwargs):
-    assert (isinstance(batch, list))
-
-    if len(batch) == 0:
-        return batch
-
-    if cols is None:
-        first_item = batch[0]
-        cols = list(first_item.keys()) if cols is None else cols
-
-    for c in cols:
-
-        # Handling prompt column.
-        if c in schema.p2r:
-            content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
-            for ind_in_batch, prompt in content_it:
-                batch[ind_in_batch][c] = prompt
-
-        # Handling column for inference.
-        if c in schema.r2p:
-            content_it = _iter_batch_responses(c=c, p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
-            for ind_in_batch, chunk_it in content_it:
-
-                chunks = []
-                for chunk in chunk_it:
-                    chunks.append(chunk)
-
-                    if return_mode == "chunk":
-                        yield [ind_in_batch, c, chunk]
-
-                batch[ind_in_batch][c] = "".join(chunks)
-
-    if return_mode == "record":
-        for record in batch:
-            yield record
-
-    if return_mode == "batch":
-        yield batch
-
-
-def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None, return_mode="batch", **kwargs):
-    """ This method represent Python API aimed at application of `llm` towards
-        iterator of input_dicts via cache_target that refers to the SQLite using
-        the given `schema`
-    """
-    assert (return_mode in ["batch", "chunk"])
-    assert (isinstance(llm, BaseLM))
-
-    # Quick initialization of the schema.
-    if isinstance(schema, str):
-        schema = JsonService.read(schema)
-    if isinstance(schema, dict):
-        schema = SchemaService(json_data=schema)
-
-    prompts_it = map(
-        lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
-        input_dicts_it
-    )
-
-    content_it = (_infer_batch(batch=batch,
-                               handle_batch_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
-                               return_mode=return_mode,
-                               schema=schema,
-                               **kwargs)
-                  for batch in BatchIterator(prompts_it, batch_size=batch_size))
-
-    yield from chain.from_iterable(content_it)
-
-
-def init_llm(adapter, **model_kwargs):
-    """ This method perform dynamic initialization of LLM from third-party resource.
-    """
-    assert (isinstance(adapter, str))
-
-    # List of the Supported models and their API wrappers.
-    models_preset = {
-        "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
-                                        class_name=llm_model_params)(**model_kwargs)
-    }
-
-    # Initialize LLM model.
-    params = adapter.split(':')
-    llm_model_type = params[0]
-    llm_model_name = params[1] if len(params) > 1 else params[-1]
-    llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
-    llm = find_by_prefix(d=models_preset, key=llm_model_type)()
-
-    return llm, llm_model_name
bulk_chain-1.0.0/bulk_chain/core/llm_base.py
DELETED

@@ -1,50 +0,0 @@
-import logging
-import time
-
-
-class BaseLM(object):
-
-    def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
-                 support_batching=False, **kwargs):
-
-        self.__name = name
-        self.__attempts = 1 if attempts is None else attempts
-        self.__delay_sec = delay_sec
-        self.__support_batching = support_batching
-
-        if enable_log:
-            self.__logger = logging.getLogger(__name__)
-            logging.basicConfig(level=logging.INFO)
-
-    def ask_core(self, batch):
-
-        for i in range(self.__attempts):
-            try:
-                if self.__support_batching:
-                    # Launch in batch mode.
-                    content = batch
-                else:
-                    # Launch in non-batch mode.
-                    assert len(batch) == 1, "The LM does not support batching," \
-                        f" while size of the content is {len(batch)} which is not equal 1. " \
-                        f"Please enable batch-supporting or set required inference settings."
-                    content = batch[0]
-
-                response = self.ask(content)
-
-                # Wrapping into batch the response in the case of non-batching mode.
-                return response if self.__support_batching else [response]
-
-            except Exception as e:
-                if self.__logger is not None:
-                    self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i, self.__attempts))
-                    self.__logger.info(e)
-                time.sleep(self.__delay_sec)
-
-        raise Exception("Can't infer")
-
-    def ask(self, content):
-        raise NotImplemented()
-
-    def name(self):
-        return self.__name.replace("/", "_")
bulk_chain-1.0.0/bulk_chain.egg-info/requires.txt
DELETED

@@ -1 +0,0 @@
-tqdm

bulk_chain-1.0.0/test/test.py
DELETED
@@ -1,62 +0,0 @@
-import random
-import time
-import logging
-
-
-def setup_logger_behaviour(name: str) -> logging.Logger:
-    root_handlers = logging.getLogger().handlers  # gets root logger
-    current_logger = logging.getLogger(name)  # gets current logger
-    if not root_handlers:  # if root logger has no handlers then create streaming handeler only
-        new_handler = logging.StreamHandler()
-        new_handler.terminator = ""
-        new_handler.setFormatter(logging.Formatter("%(message)s"))
-        current_logger.addHandler(new_handler)
-        current_logger.propagate = False
-        current_logger.setLevel(logging.INFO)
-        return current_logger
-
-    # Remove exixting Handlers from the current logger
-    for handler in current_logger.handlers[:]:
-        current_logger.removeHandler(handler)
-
-    for handler_r in root_handlers:  # if root logger has handlers
-        if type(handler_r) is logging.StreamHandler:  # if root logger has streaming handler
-            new_handler = logging.StreamHandler()
-            new_handler.terminator = ""  # This will stop the printing in new line
-            new_handler.setFormatter(logging.Formatter("%(message)s"))  # This will set the format
-            current_logger.addHandler(new_handler)
-        elif type(handler_r) is logging.FileHandler:  # if root logger has file handler
-            new_handler = logging.FileHandler(  # create new file handler
-                handler_r.baseFilename,  # with same filename and other properties
-                handler_r.mode,
-                handler_r.encoding,
-                handler_r.delay,
-                handler_r.errors,
-            )
-            new_handler.terminator = ""  # This will stop the printing in new line
-            new_handler.setFormatter(logging.Formatter("%(message)s"))  # This will set the format
-            current_logger.addHandler(new_handler)
-        else:
-            continue
-    current_logger.propagate = False  # Don't propagate to root logger
-    return current_logger
-
-# Configure the logger
-logger =logging.getLogger(__name__)
-class FakeStreamingDataGenerator:
-
-    def stream_data(self):
-        while True:
-            data = random.randint(0, 100)
-            yield data
-            time.sleep(0.5)
-
-# Example usage:
-generator = FakeStreamingDataGenerator()
-stream = generator.stream_data()
-
-logger = setup_logger_behaviour(__name__)  # call you set up function here
-while True:
-    chunk = next(stream)
-    # Replacing print with logger
-    logger.info(chunk)  # Best practice now
bulk_chain-1.0.0/test/test_api.py
DELETED

@@ -1,34 +0,0 @@
-import unittest
-from os.path import join
-
-from bulk_chain.api import iter_content, CWD
-from bulk_chain.core.utils import dynamic_init
-from utils import current_dir, API_TOKEN
-
-
-class TestAPI(unittest.TestCase):
-
-    llm = dynamic_init(class_dir=join(CWD, ".."),
-                       class_filepath="providers/replicate_104.py",
-                       class_name="Replicate")(api_token=API_TOKEN,
-                                               model_name="deepseek-ai/deepseek-r1")
-
-    @staticmethod
-    def it_data(n):
-        for i in range(n):
-            yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
-
-    def test_iter(self):
-        data_it = iter_content(input_dicts_it=self.it_data(20),
-                               llm=self.llm,
-                               batch_size=1,
-                               handle_missed_value_func=lambda *_: None,
-                               return_mode="batch",
-                               schema=join(current_dir, "schema/default.json"))
-
-        for data in data_it:
-            print(data)
-
-
-if __name__ == '__main__':
-    unittest.main()
bulk_chain-1.0.0/test/test_api_streaming.py
DELETED

@@ -1,52 +0,0 @@
-import unittest
-from os.path import join
-
-from tqdm import tqdm
-
-from bulk_chain.api import CWD, iter_content
-from bulk_chain.core.utils import dynamic_init
-from utils import API_TOKEN, iter_test_jsonl_samples
-
-
-class TestAPI_Streaming(unittest.TestCase):
-
-    llm = dynamic_init(class_dir=join(CWD, ".."),
-                       class_filepath="providers/replicate_104.py",
-                       class_name="Replicate")(api_token=API_TOKEN,
-                                               model_name="meta/meta-llama-3-70b-instruct",
-                                               stream=True)
-
-    def test_callback_mode(self):
-
-        def callback(chunk, info):
-            if chunk is None and info["action"] == "start":
-                print(f"\n{info['param']} (batch_ind={info['ind']}):\n")
-                return
-            if chunk is None and info["action"] == "end":
-                print("\n\n")
-                return
-            print(chunk, end="")
-
-        input_dicts_it = iter_test_jsonl_samples()
-        data_it = iter_content(input_dicts_it=input_dicts_it,
-                               llm=self.llm,
-                               return_batch=False,
-                               callback_stream_func=callback,
-                               handle_missed_value_func=lambda *_: None,
-                               schema="schema/thor_cot_schema.json")
-
-        for _ in tqdm(data_it):
-            print("\n|NEXT ENTRY|\n")
-
-    def test_content_iter_mode(self):
-
-        input_dicts_it = iter_test_jsonl_samples()
-        data_it = iter_content(input_dicts_it=input_dicts_it,
-                               llm=self.llm,
-                               batch_size=1,
-                               return_mode="chunk",
-                               handle_missed_value_func=lambda *_: None,
-                               schema="schema/thor_cot_schema.json")
-
-        for ind_in_batch, col, item in data_it:
-            print("\t".join([str(ind_in_batch), str(col), item]))
bulk_chain-1.0.0/test/test_args_seeking.py
DELETED

@@ -1,26 +0,0 @@
-import unittest
-
-from bulk_chain.core.utils import iter_params
-
-
-class TestArgumentsSeeking(unittest.TestCase):
-
-    def test(self):
-        params = list(iter_params("X is a {x} and p is {text} and for {k}"))
-
-        line = ",".join(["{{{x}}}".format(x=x) for x in params])
-        print(line)
-        d_params = {}
-        for param in params:
-            d_params[param] = 2
-        print(d_params)
-
-        z = line.format(**d_params)
-        print(z)
-
-        b = list(iter_params("X"))
-        print(b)
-
-
-if __name__ == '__main__':
-    unittest.main()
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|