bulk-chain 1.1.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bulk_chain-1.2.1/PKG-INFO +134 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/README.md +45 -29
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/api.py +60 -17
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/llm_base.py +8 -3
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_schema.py +5 -1
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/utils.py +29 -5
- bulk_chain-1.2.1/bulk_chain.egg-info/PKG-INFO +134 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/SOURCES.txt +1 -1
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/setup.py +1 -1
- bulk_chain-1.2.1/test/test_api.py +64 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_api_batching.py +3 -4
- bulk_chain-1.2.1/test/test_api_novita.py +34 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_api_streaming.py +3 -4
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_replicate_async_baseline.py +5 -2
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_replicate_async_batch_async.py +5 -2
- bulk_chain-1.1.0/LICENSE +0 -21
- bulk_chain-1.1.0/PKG-INFO +0 -118
- bulk_chain-1.1.0/bulk_chain.egg-info/PKG-INFO +0 -118
- bulk_chain-1.1.0/test/test_api.py +0 -67
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/__init__.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/__init__.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_asyncio.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_batch.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_data.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_dict.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_json.py +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/dependency_links.txt +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/top_level.txt +0 -0
- {bulk_chain-1.1.0 → bulk_chain-1.2.1}/setup.cfg +0 -0
bulk_chain-1.2.1/PKG-INFO
@@ -0,0 +1,134 @@
+Metadata-Version: 2.1
+Name: bulk_chain
+Version: 1.2.1
+Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
+Home-page: https://github.com/nicolay-r/bulk-chain
+Author: Nicolay Rusnachenko
+Author-email: rusnicolay@gmail.com
+License: MIT License
+Description: # bulk-chain 1.2.1
+
+[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+[](https://x.com/nicolayr_/status/1847969224636961033)
+[](https://pypistats.org/packages/bulk-chain)
+
+<p align="center">
+    <img src="logo.png"/>
+</p>
+
+<p align="center">
+<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+<br>
+<a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
+</p>
+
+A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
+
+### Main Features
+* ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
+* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
+* ✅ **Provides iterator over infinite amount of input contexts**
+
+# Installation
+
+From PyPI:
+
+```bash
+pip install --no-deps bulk-chain
+```
+
+or latest version from here:
+
+```bash
+pip install git+https://github.com/nicolay-r/bulk-chain@master
+```
+
+## Chain-of-Thought Schema
+
+To declare Chain-of-Though (CoT) schema we use `JSON` format.
+The field `schema` is a list of CoT instructions for the Large Language Model.
+Each item of the list represent a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
+All the variable names should be mentioned in `{}`.
+
+**Example**:
+```python
+[
+    {"prompt": "extract topic: {text}", "out": "topic"},
+    {"prompt": "extract subject: {text}", "out": "subject"},
+]
+```
+
+# Usage
+
+## 🤖 Prepare
+
+1. [schema](#chain-of-thought-schema)
+   * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
+2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+3. Data (iter of dictionaries)
+
+## 🚀 Launch
+
+> **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+
+```python
+from bulk_chain.core.utils import dynamic_init
+from bulk_chain.api import iter_content
+
+content_it = iter_content(
+    # 1. Your schema.
+    schema=[
+        {"prompt": "extract topic: {text}", "out": "topic" },
+        {"prompt": "extract subject: {text}", "out": "subject"},
+    ],
+    # 2. Your third-party model implementation.
+    llm=dynamic_init(class_filepath="replicate_104.py")(
+        api_token="<API-KEY>",
+        model_name="meta/meta-llama-3-70b-instruct"),
+    # 3. Toggle streaming if needed
+    stream=False,
+    # 4. Toggle Async API mode usage.
+    async_mode=True,
+    # 5. Batch size.
+    batch_size=10,
+    # 6. Your iterator of dictionaries
+    input_dicts_it=[
+        # Example of data ...
+        { "text": "Rocks are hard" },
+        { "text": "Water is wet" },
+        { "text": "Fire is hot" }
+    ],
+)
+
+for batch in content_it:
+    for entry in batch:
+        print(entry)
+```
+
+Outputs entries represent texts augmented with `topic` and `subject`:
+```jsonl
+{'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
+{'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
+{'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
+```
+
+# API
+
+| Method | Mode | Description |
+|----------------------|------------|---------------------------------------------------------------------|
+| `ask(prompt)` | Sync | Infers the model with a single prompt. |
+| `ask_stream(prompt)` | Sync | Returns a generator that yields chunks of the inferred result. |
+| `ask_async(prompt)` | Async | Asynchronously infers the model with a single prompt. |
+| `ask_stream_async(prompt)` | Async | Asynchronously returns a generator of result chunks of the inferred result. |
+
+See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+
+Keywords: natural language processing,chain-of-thought,reasoning
+Platform: UNKNOWN
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/README.md
@@ -1,4 +1,4 @@
-# bulk-chain 1.1.0
+# bulk-chain 1.2.1
 
 [](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
 [](https://x.com/nicolayr_/status/1847969224636961033)
@@ -37,24 +37,17 @@ pip install git+https://github.com/nicolay-r/bulk-chain@master
 
 ## Chain-of-Thought Schema
 
-To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
-This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
-
-Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
-All the variable names are expected to be mentioned in `{}`.
-
-Below, is an example on how to declare your own schema:
+To declare Chain-of-Though (CoT) schema we use `JSON` format.
+The field `schema` is a list of CoT instructions for the Large Language Model.
+Each item of the list represent a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
+All the variable names should be mentioned in `{}`.
 
+**Example**:
 ```python
-{
-    "name": "schema-name",
-    "schema": [
-        {"prompt": "Given the question '{text}', let's think step-by-step.",
-         "out": "steps"},
-        {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
-         "out": "answer"},
+[
+    {"prompt": "extract topic: {text}", "out": "topic"},
+    {"prompt": "extract subject: {text}", "out": "subject"},
 ]
-}
 ```
 
 # Usage
@@ -76,25 +69,48 @@ from bulk_chain.api import iter_content
 
 content_it = iter_content(
     # 1. Your schema.
-    schema="YOUR_SCHEMA.json",
+    schema=[
+        {"prompt": "extract topic: {text}", "out": "topic" },
+        {"prompt": "extract subject: {text}", "out": "subject"},
+    ],
     # 2. Your third-party model implementation.
-    llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
-    # 3. Customize your inference and result providing modes:
-    infer_mode="batch_async",
-    return_mode="batch",
-    # 4. Your iterator of dictionaries
-    input_dicts_it=YOUR_DATA_IT,
+    llm=dynamic_init(class_filepath="replicate_104.py")(
+        api_token="<API-KEY>",
+        model_name="meta/meta-llama-3-70b-instruct"),
+    # 3. Toggle streaming if needed
+    stream=False,
+    # 4. Toggle Async API mode usage.
+    async_mode=True,
+    # 5. Batch size.
+    batch_size=10,
+    # 6. Your iterator of dictionaries
+    input_dicts_it=[
+        # Example of data ...
+        { "text": "Rocks are hard" },
+        { "text": "Water is wet" },
+        { "text": "Fire is hot" }
+    ],
 )
 
-for content in content_it:
-    # Handle your LLM responses here ...
+for batch in content_it:
+    for entry in batch:
+        print(entry)
 ```
 
+Outputs entries represent texts augmented with `topic` and `subject`:
+```jsonl
+{'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
+{'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
+{'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
+```
 
-# Embed your LLM
+# API
 
-All you have to do is to implement `BaseLM` class, that includes:
-* `__init__` -- for setting up *batching mode support* and (optional) *model name*;
-* `ask(prompt)` -- infer your model with the given `prompt`.
+| Method | Mode | Description |
+|----------------------|------------|---------------------------------------------------------------------|
+| `ask(prompt)` | Sync | Infers the model with a single prompt. |
+| `ask_stream(prompt)` | Sync | Returns a generator that yields chunks of the inferred result. |
+| `ask_async(prompt)` | Async | Asynchronously infers the model with a single prompt. |
+| `ask_stream_async(prompt)` | Async | Asynchronously returns a generator of result chunks of the inferred result. |
 
 See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
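The table added to the README documents the provider-side interface only. A minimal sketch of consuming the async streaming method directly, assuming the provider implements `ask_stream_async` as an async generator (as the `AsyncGeneratorType` handling added to `api.py` below suggests); the `collect` helper is invented here for illustration:

```python
import asyncio

async def collect(llm, prompt):
    # Accumulate streamed chunks into a single string;
    # `llm` is any provider implementing BaseLM.ask_stream_async.
    parts = []
    async for chunk in llm.ask_stream_async(prompt):
        parts.append(str(chunk))
    return "".join(parts)

# Hypothetical usage:
# result = asyncio.run(collect(llm, "extract topic: Rocks are hard"))
```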
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/api.py
@@ -3,6 +3,7 @@ import collections
 import logging
 import os
 from itertools import chain
+from types import AsyncGeneratorType
 
 from bulk_chain.core.llm_base import BaseLM
 from bulk_chain.core.service_asyncio import AsyncioService
@@ -16,8 +17,8 @@ from bulk_chain.core.utils import attempt_wrapper
 
 INFER_MODES = {
     "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
+    "batch": lambda llm, batch, **kwargs: llm.ask_batch(batch),
     "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
-    "batch": lambda llm, batch, **kwargs: llm.ask(batch),
     "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
         batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
     ),
@@ -69,6 +70,9 @@ def __handle_gen(handle, batch, event_loop):
     elif isinstance(entry, collections.abc.Iterable):
         for chunk in map(lambda item: str(item), entry):
             yield chunk
+    elif isinstance(entry, AsyncGeneratorType):
+        for chunk in AsyncioService.async_gen_to_iter(entry, loop=event_loop):
+            yield str(chunk)
     else:
         raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
 
@@ -85,11 +89,14 @@ def _iter_chunks(p_column, batch_content_it, **kwargs):
         yield ind_in_batch, chunk
 
 
-def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
+def _column_ordered_chunks_iter(batch, schema, cols=None, keep_prompts=True, **kwargs):
+    """
+    NOTE: we populate `batch` content automatically
+    """
    assert (isinstance(batch, list))
 
    if len(batch) == 0:
-        return
+        return
 
    if cols is None:
        first_item = batch[0]
@@ -112,33 +119,66 @@ def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
        for ind_in_batch, chunk in content_it:
            # Append batch.
            batch[ind_in_batch][c].append(chunk)
-
-            if return_mode == "chunk":
-                global_ind = batch_ind * len(batch) + ind_in_batch
-                yield [global_ind, c, chunk]
+            yield [ind_in_batch, c, chunk]
 
        # Convert content to string.
        for item in batch:
            item[c] = "".join(item[c])
 
-    if return_mode == "record":
+    if not keep_prompts:
+        for batch_item in batch:
+            for key in list(batch_item.keys()):
+                prompt_col = SchemaService.col_to_prompt(col_name=key, prompt_data=batch_item)
+                if prompt_col in batch_item:
+                    del batch_item[prompt_col]
+
+
+def _infer_batch(return_type, batch, batch_ind, **kwargs):
+    assert (return_type in ["batch", "chunk", "record"])
+
+    # Filling batch with inference content.
+    for ind_in_batch, column, chunk in _column_ordered_chunks_iter(batch=batch, **kwargs):
+        if return_type == "chunk":
+            global_ind = batch_ind * len(batch) + ind_in_batch
+            yield [global_ind, column, chunk]
+
+    if return_type == "record":
        for record in batch:
            yield record
 
-    if return_mode == "batch":
+    if return_type == "batch":
        yield batch
 
 
+def get_infer_mode(stream, batch_size, async_mode):
+    if not stream and batch_size == 1:
+        return 'single', 'record'
+    elif not stream and batch_size > 1:
+        if async_mode:
+            return 'batch_async', 'batch'
+        else:
+            return 'batch', 'batch'
+    elif stream and batch_size == 1:
+        return 'single_stream', 'chunk'
+    elif stream and batch_size > 1:
+        return 'batch_stream_async', 'chunk'
+
+    raise ValueError(f"Invalid combination of stream and batch_size: {stream}, {batch_size}")
+
+
 def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
-
-                 **kwargs):
+                 stream=False, async_mode=False, attempts=1, event_loop=None,
+                 handle_missed_value_func=lambda *_: None, **kwargs):
    """ This method represent Python API aimed at application of `llm` towards
    iterator of input_dicts via cache_target that refers to the SQLite using
    the given `schema`
    """
-    assert (infer_mode in INFER_MODES.keys())
-    assert (return_mode in ["batch", "chunk", "record"])
    assert (isinstance(llm, BaseLM))
+    assert (isinstance(batch_size, int) and batch_size > 0)
+    assert (isinstance(async_mode, bool))
+
+    infer_type, return_type = get_infer_mode(stream=stream, batch_size=batch_size, async_mode=async_mode)
+    infer_mode = INFER_MODES[infer_type]
 
    # Setup event loop.
    event_loop = asyncio.get_event_loop_policy().get_event_loop() \
@@ -149,13 +189,15 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
    schema = JsonService.read(schema)
    if isinstance(schema, dict):
        schema = SchemaService(json_data=schema)
+    if isinstance(schema, list):
+        schema = SchemaService(json_data={"schema": schema})
 
    prompts_it = map(
        lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
        input_dicts_it
    )
 
-    handle_batch_func = lambda batch, **handle_kwargs:
+    handle_batch_func = lambda batch, **handle_kwargs: infer_mode(
        llm,
        DataService.limit_prompts(batch, limit=limit_prompt),
        **handle_kwargs
@@ -172,12 +214,13 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
                              logger=logger)
        handle_batch_func = attempt_dec(handle_batch_func)
 
-
+    kwargs["handle_missed_value_func"] = handle_missed_value_func
+
+    content_it = (_infer_batch(return_type=return_type,
+                               batch=batch,
                               batch_ind=batch_ind,
                               infer_mode=infer_mode,
                               handle_batch_func=handle_batch_func,
-                               handle_missed_value_func=lambda *_: None,
-                               return_mode=return_mode,
                               schema=schema,
                               event_loop=event_loop,
                               **kwargs)
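The `get_infer_mode` helper above replaces the explicit `infer_mode`/`return_mode` arguments that `iter_content` accepted in 1.1.0. Its dispatch, restated as a sketch for quick reference (mode names taken verbatim from the hunk; note that `batch_stream_async` is selected for streaming batches, yet no `INFER_MODES` entry for it appears within the hunks of this diff):

```python
# (stream, batch_size, async_mode) -> (infer mode, return type)
# stream=False, batch_size == 1                   -> ("single",             "record")
# stream=False, batch_size > 1, async_mode=False  -> ("batch",              "batch")
# stream=False, batch_size > 1, async_mode=True   -> ("batch_async",        "batch")
# stream=True,  batch_size == 1                   -> ("single_stream",      "chunk")
# stream=True,  batch_size > 1                    -> ("batch_stream_async", "chunk")
```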
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/llm_base.py
@@ -3,12 +3,17 @@ class BaseLM(object):
     def __init__(self, **kwargs):
         pass
 
-    def ask(self,
+    def ask(self, prompt):
        """ Assumes to return str.
        """
        raise NotImplemented()
 
-    def
+    def ask_batch(self, batch):
+        """ Assumes to return generator.
+        """
+        raise NotImplemented()
+
+    def ask_stream(self, prompt):
        """ Assumes to return generator.
        """
        raise NotImplemented()
@@ -18,7 +23,7 @@ class BaseLM(object):
        """
        raise NotImplemented()
 
-    async def ask_stream_async(self,
+    async def ask_stream_async(self, prompt):
        """ Assumes to return AsyncGenerator.
        """
        raise NotImplemented()
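With the 1.2.1 signatures fixed above, a provider only needs to override the entry points it intends to serve. A minimal sketch of a custom provider against this interface; the `EchoLM` class and its echo behaviour are invented here purely for illustration:

```python
from bulk_chain.core.llm_base import BaseLM


class EchoLM(BaseLM):
    """Hypothetical provider: echoes prompts back, for illustration only."""

    def ask(self, prompt):
        # Sync, single prompt; must return str.
        return f"echo: {prompt}"

    def ask_batch(self, batch):
        # Sync, whole batch; assumed to yield one result per prompt.
        return (self.ask(p) for p in batch)

    def ask_stream(self, prompt):
        # Sync streaming; yields chunks of the inferred result.
        yield from self.ask(prompt).split()
```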
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_schema.py
@@ -9,6 +9,10 @@ class SchemaService(object):
         prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
         return cls(prompt_schema)
 
+    @staticmethod
+    def col_to_prompt(col_name, prompt_data):
+        return col_name + "_prompt" if "in" not in prompt_data else prompt_data["in"]
+
     @staticmethod
     def __init_schema(prompts):
 
@@ -19,7 +23,7 @@ class SchemaService(object):
 
         for prompt in prompts:
             r_col_name = prompt["out"]
-            p_col_name = r_col_name
+            p_col_name = SchemaService.col_to_prompt(col_name=r_col_name, prompt_data=prompt)
 
             assert r_col_name not in schema_r2p, f"`{r_col_name}` has been already declared!"
             assert p_col_name not in schema_p2r, f"`{p_col_name}` has been already declared!"
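The new `col_to_prompt` is what lets 1.2.1 keep prompt text and model output in distinct columns (1.1.0 reused the output name for both, per the removed line). Its behaviour on the two schema shapes, sketched with assumed inputs:

```python
from bulk_chain.core.service_schema import SchemaService

# No explicit "in" key: the prompt column is derived from the output name.
SchemaService.col_to_prompt(col_name="topic",
                            prompt_data={"prompt": "extract topic: {text}", "out": "topic"})
# -> "topic_prompt"

# An explicit "in" key overrides the derived name.
SchemaService.col_to_prompt(col_name="response",
                            prompt_data={"prompt": "...", "out": "response", "in": "prompt"})
# -> "prompt"
```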
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/utils.py
@@ -1,3 +1,4 @@
+import ast
 import importlib
 import logging
 import sys
@@ -35,18 +36,30 @@ def find_by_prefix(d, key):
     return d[matches[0]]
 
 
+def check_is_param_name(param_name):
+    return param_name.replace("_", "").isalpha()
+
+
 def iter_params(text):
     assert(isinstance(text, str))
     beg = 0
     while beg < len(text):
+        print(beg)
         try:
             pb = text.index('{', beg)
         except ValueError:
             break
-        pe = text.index('}',
-
-
-
+        pe = text.index('}', pb+1)
+        param_name = text[pb + 1:pe]
+
+        # Check parameter validity.
+        if not check_is_param_name(param_name):
+            beg = pb + 1
+            continue
+
+        # Passing.
+        yield param_name
+        beg = pe + 1
 
 
 def auto_import(name, is_class=False):
@@ -61,6 +74,17 @@ def auto_import(name, is_class=False):
     return m() if is_class else m
 
 
+def get_class_name(file_path):
+    with open(file_path, 'r') as f:
+        tree = ast.parse(f.read(), filename=file_path)
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ClassDef):
+            return node.name
+
+    return None
+
+
 def dynamic_init(class_filepath, class_name=None):
 
     # Registering path.
@@ -75,7 +99,7 @@ def dynamic_init(class_filepath, class_name=None):
         class_filename = class_filename[:-len(".py")]
 
     # Loading library.
-    class_name =
+    class_name = get_class_name(class_filepath) if class_name is None else class_name
     class_path = ".".join([class_filename, class_name])
     logger.info(f"Dynamic loading for the file and class `{class_path}`")
     cls = auto_import(class_path, is_class=False)
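Two behavioural notes on the helpers above, sketched with assumed inputs and outputs rather than taken from the package tests:

```python
from bulk_chain.core.utils import iter_params, get_class_name

# iter_params now validates each {...} body via check_is_param_name, so brace
# pairs that are not plain identifiers are skipped (the print(beg) added in the
# hunk writes scan positions to stdout on every loop iteration).
list(iter_params("For the question '{text}' the steps are '{steps}'."))
# -> ["text", "steps"]
list(iter_params("a literal {1+2} is not a parameter"))
# -> []

# get_class_name returns the first class defined in a file, which is why
# dynamic_init no longer requires class_name (e.g. a hypothetical provider file
# defining `class Replicate(BaseLM)` would yield "Replicate").
get_class_name("replicate_104.py")
```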
bulk_chain-1.2.1/bulk_chain.egg-info/PKG-INFO
@@ -0,0 +1,134 @@
+Metadata-Version: 2.1
+Name: bulk-chain
+Version: 1.2.1
(the remaining 131 added lines are identical to bulk_chain-1.2.1/PKG-INFO above)
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/SOURCES.txt
@@ -1,4 +1,3 @@
-LICENSE
 README.md
 setup.py
 bulk_chain/__init__.py
@@ -18,6 +17,7 @@ bulk_chain/core/service_schema.py
 bulk_chain/core/utils.py
 test/test_api.py
 test/test_api_batching.py
+test/test_api_novita.py
 test/test_api_streaming.py
 test/test_replicate_async_baseline.py
 test/test_replicate_async_batch_async.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/setup.py
@@ -15,7 +15,7 @@ def get_requirements(filenames):
 
 setup(
     name='bulk_chain',
-    version='1.1.0',
+    version='1.2.1',
     python_requires=">=3.6",
     description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
                 'ensuring reliable results for bulk input requests.',
bulk_chain-1.2.1/test/test_api.py
@@ -0,0 +1,64 @@
+import unittest
+from os.path import join
+
+from bulk_chain.api import iter_content
+from utils import current_dir, default_remote_llm
+
+
+class TestAPI(unittest.TestCase):
+
+    llm = default_remote_llm()
+
+    @staticmethod
+    def it_data(n):
+        for i in range(n):
+            yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
+
+    def test_single(self):
+        data_it = iter_content(input_dicts_it=self.it_data(20),
+                               llm=self.llm,
+                               batch_size=1,
+                               schema=join(current_dir, "schema/default.json"))
+
+        for data in data_it:
+            print(data)
+
+    def test_single_stream(self):
+        """ Returns individual chunks.
+        """
+        chunk_it = iter_content(input_dicts_it=self.it_data(20),
+                                llm=self.llm,
+                                batch_size=1,
+                                stream=True,
+                                schema=join(current_dir, "schema/default.json"))
+
+        for data in chunk_it:
+            print(data)
+
+    def test_batch_async(self):
+        """ Return batches that passed async at the Replicate.
+        """
+        batch_it = iter_content(input_dicts_it=self.it_data(20),
+                                llm=self.llm,
+                                batch_size=5,
+                                async_mode=True,
+                                schema=join(current_dir, "schema/default.json"))
+
+        for batch in batch_it:
+            for item in batch:
+                print(item)
+
+    def test_batch_stream_async(self):
+        chunk_it = iter_content(input_dicts_it=self.it_data(20),
+                                llm=self.llm,
+                                batch_size=5,
+                                stream=True,
+                                async_mode=True,
+                                schema=join(current_dir, "schema/default.json"))
+
+        for chunk_info in chunk_it:
+            print(chunk_info)
+
+
+if __name__ == '__main__':
+    unittest.main()
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_api_batching.py
@@ -9,15 +9,14 @@ from utils import iter_test_jsonl_samples
 
 class TestProviderBatching(unittest.TestCase):
 
-    llm = dynamic_init(class_filepath="providers/transformers_flan_t5.py"
-
-
+    llm = dynamic_init(class_filepath="providers/transformers_flan_t5.py")(
+        model_name="nicolay-r/flan-t5-tsa-thor-base",
+        max_new_tokens=128)
 
     def test_iter(self):
         input_dicts_it = iter_test_jsonl_samples()
         data_it = iter_content(input_dicts_it=input_dicts_it,
                                llm=self.llm,
-                               infer_mode="batch",
                                batch_size=10,
                                return_batch=False,
                                schema="schema/thor_cot_schema.json")
bulk_chain-1.2.1/test/test_api_novita.py
@@ -0,0 +1,34 @@
+from bulk_chain.api import iter_content
+from bulk_chain.core.utils import dynamic_init
+
+
+# Tested under: https://github.com/nicolay-r/nlp-thirdgate/blob/master/llm/openai_156.py
+provider = dynamic_init(class_filepath="providers/openai_156.py")(
+    base_url="https://api.novita.ai/openai",
+    api_token="sk_qpuGSSuz4sF3PgWXg72wstp6-ojtYt1rX3pdXTVocPU",
+    model_name="meta-llama/llama-3.3-70b-instruct")
+
+content_it = iter_content(
+    # 1. Your schema.
+    schema=[
+        {"prompt": "extract topic: {text}", "out": "topic" },
+        {"prompt": "extract subject: {text}", "out": "subject"},
+    ],
+    # 2. Your third-party model implementation.
+    llm=provider,
+    batch_size=10,
+    # 3. Toggle streaming if needed
+    stream=False,
+    # 4. Toggle asynchronous mode if needed
+    async_mode=True,
+    # 5. Your iterator of dictionaries
+    input_dicts_it=[
+        { "text": "Rocks are hard" },
+        { "text": "Water is wet" },
+        { "text": "Fire is hot" }
+    ],
+)
+
+for batch in content_it:
+    for entry in batch:
+        print(entry)
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_api_streaming.py
@@ -1,7 +1,7 @@
 import unittest
 
 from bulk_chain.api import iter_content
-from utils import iter_test_jsonl_samples,
+from utils import iter_test_jsonl_samples, default_remote_llm
 
 
 class TestAPI_Streaming(unittest.TestCase):
@@ -10,10 +10,9 @@ class TestAPI_Streaming(unittest.TestCase):
 
         input_dicts_it = iter_test_jsonl_samples()
         data_it = iter_content(input_dicts_it=input_dicts_it,
-                               llm=
+                               llm=default_remote_llm(),
                                batch_size=1,
-
-                               return_mode="chunk",
+                               stream=True,
                                attempts=2,
                                schema="schema/thor_cot_schema.json")
 
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_replicate_async_baseline.py
@@ -1,8 +1,11 @@
 from timeit import default_timer as timer
-from utils import
+from utils import default_remote_llm
+
+
+llm = default_remote_llm()
 
 start = timer()
-r = ["".join([str(s) for s in
+r = ["".join([str(s) for s in llm.ask(f"what's the color of the {p}")])
      for p in ["sky", "ground", "water"]]
 end = timer()
 
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/test/test_replicate_async_batch_async.py
@@ -1,12 +1,15 @@
 from timeit import default_timer as timer
 import asyncio
 
-from utils import
+from utils import default_remote_llm
+
+
+llm = default_remote_llm()
 
 
 async def infer_item(prompt):
     content = []
-    for chunk in
+    for chunk in llm.ask(prompt):
         content.append(str(chunk))
     return content
 
bulk_chain-1.1.0/LICENSE
DELETED
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2024 Nicolay Rusnachenko
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
bulk_chain-1.1.0/PKG-INFO
DELETED
@@ -1,118 +0,0 @@
-Metadata-Version: 2.1
-Name: bulk_chain
-Version: 1.1.0
-Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
-Home-page: https://github.com/nicolay-r/bulk-chain
-Author: Nicolay Rusnachenko
-Author-email: rusnicolay@gmail.com
-License: MIT License
-Keywords: natural language processing,chain-of-thought,reasoning
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Classifier: Topic :: Scientific/Engineering :: Information Analysis
-Classifier: Topic :: Text Processing :: Linguistic
-Requires-Python: >=3.6
-Description-Content-Type: text/markdown
-License-File: LICENSE
-
-# bulk-chain 1.1.0
-
-[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
-[](https://x.com/nicolayr_/status/1847969224636961033)
-[](https://pypistats.org/packages/bulk-chain)
-
-<p align="center">
-    <img src="logo.png"/>
-</p>
-
-<p align="center">
-<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
-<br>
-<a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
-</p>
-
-A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
-
-### Main Features
-* ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
-* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
-* ✅ **Provides iterator over infinite amount of input contexts**
-
-# Installation
-
-From PyPI:
-
-```bash
-pip install --no-deps bulk-chain
-```
-
-or latest version from here:
-
-```bash
-pip install git+https://github.com/nicolay-r/bulk-chain@master
-```
-
-## Chain-of-Thought Schema
-
-To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
-This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
-
-Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
-All the variable names are expected to be mentioned in `{}`.
-
-Below, is an example on how to declare your own schema:
-
-```python
-{
-    "name": "schema-name",
-    "schema": [
-        {"prompt": "Given the question '{text}', let's think step-by-step.",
-         "out": "steps"},
-        {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
-         "out": "answer"},
-    ]
-}
-```
-
-# Usage
-
-## 🤖 Prepare
-
-1. [schema](#chain-of-thought-schema)
-   * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
-2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
-3. Data (iter of dictionaries)
-
-## 🚀 Launch
-
-> **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
-
-```python
-from bulk_chain.core.utils import dynamic_init
-from bulk_chain.api import iter_content
-
-content_it = iter_content(
-    # 1. Your schema.
-    schema="YOUR_SCHEMA.json",
-    # 2. Your third-party model implementation.
-    llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
-    # 3. Customize your inference and result providing modes:
-    infer_mode="batch_async",
-    return_mode="batch",
-    # 4. Your iterator of dictionaries
-    input_dicts_it=YOUR_DATA_IT,
-)
-
-for content in content_it:
-    # Handle your LLM responses here ...
-```
-
-
-# Embed your LLM
-
-All you have to do is to implement `BaseLM` class, that includes:
-* `__init__` -- for setting up *batching mode support* and (optional) *model name*;
-* `ask(prompt)` -- infer your model with the given `prompt`.
-
-See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
bulk_chain-1.1.0/bulk_chain.egg-info/PKG-INFO
DELETED
@@ -1,118 +0,0 @@
(all 118 removed lines are identical to bulk_chain-1.1.0/PKG-INFO above)
bulk_chain-1.1.0/test/test_api.py
DELETED
@@ -1,67 +0,0 @@
-import unittest
-from os.path import join
-
-from bulk_chain.api import iter_content
-from utils import current_dir, DEFAULT_REMOTE_LLM
-
-
-class TestAPI(unittest.TestCase):
-
-
-    @staticmethod
-    def it_data(n):
-        for i in range(n):
-            yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
-
-    def test_single(self):
-        data_it = iter_content(input_dicts_it=self.it_data(20),
-                               llm=DEFAULT_REMOTE_LLM,
-                               batch_size=1,
-                               infer_mode="single",
-                               return_mode="batch",
-                               schema=join(current_dir, "schema/default.json"))
-
-        for data in data_it:
-            print(data)
-
-    def test_single_stream(self):
-        """ Returns individual chunks.
-        """
-        data_it = iter_content(input_dicts_it=self.it_data(20),
-                               llm=DEFAULT_REMOTE_LLM,
-                               batch_size=1,
-                               infer_mode="single_stream",
-                               return_mode="chunk",
-                               schema=join(current_dir, "schema/default.json"))
-
-        for data in data_it:
-            print(data)
-
-    def test_batch_async(self):
-        """ Return batches that passed async at the Replicate.
-        """
-        data_it = iter_content(input_dicts_it=self.it_data(20),
-                               llm=DEFAULT_REMOTE_LLM,
-                               batch_size=5,
-                               infer_mode="batch_async",
-                               return_mode="batch",
-                               schema=join(current_dir, "schema/default.json"))
-
-        for batch in data_it:
-            for item in batch:
-                print(item)
-
-    def test_batch_stream_async(self):
-        data_it = iter_content(input_dicts_it=self.it_data(20),
-                               llm=DEFAULT_REMOTE_LLM,
-                               batch_size=5,
-                               infer_mode="batch_stream_async",
-                               return_mode="chunk",
-                               schema=join(current_dir, "schema/default.json"))
-
-        for chunk_info in data_it:
-            print(chunk_info)
-
-
-if __name__ == '__main__':
-    unittest.main()
Files without changes (10):
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/__init__.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/__init__.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_asyncio.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_batch.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_data.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_dict.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain/core/service_json.py
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/dependency_links.txt
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/top_level.txt
{bulk_chain-1.1.0 → bulk_chain-1.2.1}/setup.cfg