bulk-chain 0.25.0__tar.gz → 0.25.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/PKG-INFO +54 -16
- bulk_chain-0.25.2/README.md +127 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/api.py +22 -2
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/llm_base.py +1 -1
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_args.py +6 -1
- bulk_chain-0.25.2/bulk_chain/core/service_llm.py +94 -0
- bulk_chain-0.25.2/bulk_chain/core/utils_logger.py +41 -0
- bulk_chain-0.25.2/bulk_chain/demo.py +84 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/infer.py +5 -35
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/PKG-INFO +54 -16
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/SOURCES.txt +6 -1
- bulk_chain-0.25.2/bulk_chain.egg-info/requires.txt +2 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/setup.py +1 -1
- bulk_chain-0.25.2/test/test.py +62 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/test/test_api.py +5 -4
- bulk_chain-0.25.2/test/test_cmdargs.py +29 -0
- bulk_chain-0.25.2/test/test_provider_batching.py +31 -0
- bulk_chain-0.25.0/README.md +0 -91
- bulk_chain-0.25.0/bulk_chain/core/service_llm.py +0 -82
- bulk_chain-0.25.0/test/test_cmdargs.py +0 -20
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/LICENSE +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/__init__.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/__init__.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_batch.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_data.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_dict.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_json.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_schema.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/utils.py +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/dependency_links.txt +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/top_level.txt +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/setup.cfg +0 -0
- {bulk_chain-0.25.0 → bulk_chain-0.25.2}/test/test_args_seeking.py +0 -0
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bulk_chain
-Version: 0.25.0
+Version: 0.25.2
 Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
 Home-page: https://github.com/nicolay-r/bulk-chain
 Author: Nicolay Rusnachenko
@@ -15,8 +15,10 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: tqdm
+Requires-Dist: source-iter==0.24.3
 
-# bulk-chain 0.25.0
+# bulk-chain 0.25.2
 
 [](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
 [](https://x.com/nicolayr_/status/1847969224636961033)
@@ -26,7 +28,13 @@ License-File: LICENSE
 <img src="logo.png"/>
 </p>
 
-A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
+<p align="center">
+<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+<br>
+<a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
+</p>
+
+A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
 
 ### Main Features
 * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
@@ -42,7 +50,7 @@ A lightweight, no-strings-attached **framework** for your LLM that allows apply
 From PyPI:
 
 ```bash
-pip install bulk-chain
+pip install --no-deps bulk-chain
 ```
 
 or latest version from here:
@@ -73,37 +81,67 @@ Below, is an example on how to declare your own schema:
 }
 ```
 
-Another templates are available [here](/ext/schema/).
-
 # Usage
 
 Preliminary steps:
 
 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
-2. Wrap or pick **LLM model** from the [list of presets](/ext/).
+2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
 
-## API
+## Shell
 
-Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+### Demo Mode
+
+**demo mode** to interact with LLM via command line with LLM output streaming support.
+The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
+
+Quck start with launching demo:
+1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
+2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
+3. 🚀 Launch `demo.py` as follows:
+```bash
+python3 -m bulk_chain.demo \
+    --schema "test/schema/thor_cot_schema.json" \
+    --adapter "dynamic:replicate_104.py:Replicate" \
+    %%m \
+    --model_name "meta/meta-llama-3-70b-instruct" \
+    --api_token "<REPLICATE-API-TOKEN>" \
+    --stream
+```
+
+📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
+
 
-## Shell
 
-> **NOTE:** You have to install `source-iter` package
+### Inference Mode
 
+> **NOTE:** You have to install `source-iter` and `tqdm` packages that actual [dependencies](dependencies.txt) of this project
+
+1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
+```bash
+wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
+```
+2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
+3. 🚀 Launch inference using `DeepSeek-R1`:
 ```bash
 python3 -m bulk_chain.infer \
     --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
-    --schema "ext/schema/default.json" \
-    --adapter "dynamic:ext/replicate.py:Replicate" \
+    --schema "test/schema/default.json" \
+    --adapter "replicate_104.py:Replicate" \
     %%m \
-    --api_token "<REPLICATE-API-TOKEN>" \
-    --temp 0.1
+    --model_name "deepseek-ai/deepseek-r1" \
+    --api_token "<REPLICATE-API-TOKEN>"
 ```
 
+## API
+
+Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+
+
 # Embed your LLM
 
 All you have to do is to implement `BaseLM` class, that includes:
 * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
 * `ask(prompt)` -- infer your model with the given `prompt`.
 
-See examples with models [here](/ext).
+See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
````
bulk_chain-0.25.2/README.md ADDED

````diff
@@ -0,0 +1,127 @@
+# bulk-chain 0.25.2
+
+[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+[](https://x.com/nicolayr_/status/1847969224636961033)
+[](https://pypistats.org/packages/bulk-chain)
+
+<p align="center">
+<img src="logo.png"/>
+</p>
+
+<p align="center">
+<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+<br>
+<a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
+</p>
+
+A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
+
+### Main Features
+* ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
+* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
+* ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
+
+### Extra Features
+* ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
+
+
+# Installation
+
+From PyPI:
+
+```bash
+pip install --no-deps bulk-chain
+```
+
+or latest version from here:
+
+```bash
+pip install git+https://github.com/nicolay-r/bulk-chain@master
+```
+
+## Chain-of-Thought Schema
+
+To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
+This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
+
+Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
+All the variable names are expected to be mentioned in `{}`.
+
+Below, is an example on how to declare your own schema:
+
+```python
+{
+  "name": "schema-name",
+  "schema": [
+    {"prompt": "Given the question '{text}', let's think step-by-step.",
+     "out": "steps"},
+    {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
+     "out": "answer"},
+  ]
+}
+```
+
+# Usage
+
+Preliminary steps:
+
+1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
+2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+
+## Shell
+
+### Demo Mode
+
+**demo mode** to interact with LLM via command line with LLM output streaming support.
+The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
+
+Quck start with launching demo:
+1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
+2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
+3. 🚀 Launch `demo.py` as follows:
+```bash
+python3 -m bulk_chain.demo \
+    --schema "test/schema/thor_cot_schema.json" \
+    --adapter "dynamic:replicate_104.py:Replicate" \
+    %%m \
+    --model_name "meta/meta-llama-3-70b-instruct" \
+    --api_token "<REPLICATE-API-TOKEN>" \
+    --stream
+```
+
+📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
+
+
+
+### Inference Mode
+
+> **NOTE:** You have to install `source-iter` and `tqdm` packages that actual [dependencies](dependencies.txt) of this project
+
+1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
+```bash
+wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
+```
+2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
+3. 🚀 Launch inference using `DeepSeek-R1`:
+```bash
+python3 -m bulk_chain.infer \
+    --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
+    --schema "test/schema/default.json" \
+    --adapter "replicate_104.py:Replicate" \
+    %%m \
+    --model_name "deepseek-ai/deepseek-r1" \
+    --api_token "<REPLICATE-API-TOKEN>"
+```
+
+## API
+
+Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+
+
+# Embed your LLM
+
+All you have to do is to implement `BaseLM` class, that includes:
+* `__init__` -- for setting up *batching mode support* and (optional) *model name*;
+* `ask(prompt)` -- infer your model with the given `prompt`.
+
+See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
````
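For the `# Embed your LLM` section of the new README above, a minimal provider sketch may help orientation. `EchoModel` is hypothetical and not part of the package; only `BaseLM` and the `ask(prompt)` contract come from the README, and the inherited constructor is left untouched to avoid assuming its signature.

```python
# Minimal custom provider sketch for the "Embed your LLM" section.
# EchoModel is a hypothetical example, not a shipped provider.
from bulk_chain.core.llm_base import BaseLM


class EchoModel(BaseLM):

    def ask(self, prompt):
        # Replace this stub with a real call to your model or API.
        return f"echo: {prompt}"
```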
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/api.py

```diff
@@ -7,7 +7,7 @@ from bulk_chain.core.service_data import DataService
 from bulk_chain.core.service_dict import DictionaryService
 from bulk_chain.core.service_json import JsonService
 from bulk_chain.core.service_schema import SchemaService
-
+from bulk_chain.core.utils import dynamic_init, find_by_prefix
 
 INFER_MODES = {
     "default": lambda llm, prompt, limit_prompt=None: llm.ask_core(
@@ -76,4 +76,24 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, l
                      schema=schema)
         for batch in BatchIterator(prompts_it, batch_size=batch_size))
 
-    yield from content_it if return_batch else chain.from_iterable(content_it)
+    yield from content_it if return_batch else chain.from_iterable(content_it)
+
+
+def init_llm(adapter, **model_kwargs):
+    """ This method perform dynamic initialization of LLM from third-party resource.
+    """
+
+    # List of the Supported models and their API wrappers.
+    models_preset = {
+        "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
+                                        class_name=llm_model_params)(**model_kwargs)
+    }
+
+    # Initialize LLM model.
+    params = adapter.split(':')
+    llm_model_type = params[0]
+    llm_model_name = params[1] if len(params) > 1 else params[-1]
+    llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
+    llm = find_by_prefix(d=models_preset, key=llm_model_type)()
+
+    return llm, llm_model_name
```
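The `init_llm` function introduced in `api.py` above parses the `adapter` string as colon-separated segments: preset type, provider file path, and class name, with all remaining keyword arguments forwarded to the provider constructor. A usage sketch, assuming `replicate_104.py` has been downloaded as described in the README:

```python
from bulk_chain.api import init_llm

# "dynamic" preset -> load class Replicate from replicate_104.py;
# model_name and api_token are forwarded to the provider constructor.
llm, llm_model_name = init_llm(
    adapter="dynamic:replicate_104.py:Replicate",
    model_name="meta/meta-llama-3-70b-instruct",
    api_token="<REPLICATE-API-TOKEN>")
```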
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_args.py

```diff
@@ -12,6 +12,11 @@ class CmdArgsService:
     def iter_arguments(lst):
 
         def __release():
+
+            # We use the True value by default to treat the related parameter as flag.
+            if len(buf) == 0:
+                buf.append(True)
+
             return key, buf if len(buf) > 1 else buf[0]
 
         key = None
@@ -29,7 +34,7 @@ class CmdArgsService:
             buf.append(a)
 
         # Sharing the remaining params.
-        if
+        if key is not None:
             yield __release()
 
     @staticmethod
```
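The `__release` change above defaults an empty value buffer to `True`, so a parameter given without a value acts as a boolean flag; this is what makes the demo's bare `--stream` switch work. A sketch of the intended effect, assuming `args_to_dict` is built on `iter_arguments`:

```python
from bulk_chain.core.service_args import CmdArgsService

# "--stream" carries no value, so it now releases as ("stream", True)
# instead of producing an empty buffer.
args = CmdArgsService.args_to_dict(["--model_name", "deepseek-ai/deepseek-r1", "--stream"])
# expected: {"model_name": "deepseek-ai/deepseek-r1", "stream": True}
```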
bulk_chain-0.25.2/bulk_chain/core/service_llm.py ADDED

```diff
@@ -0,0 +1,94 @@
+from bulk_chain.core.llm_base import BaseLM
+from bulk_chain.core.service_data import DataService
+from bulk_chain.core.utils import iter_params
+from bulk_chain.core.utils_logger import StreamedLogger
+
+
+def pad_str(text, pad):
+    return text.rjust(len(text) + pad, ' ')
+
+
+def nice_output(text, remove_new_line=False):
+    short_text = text.replace("\n", "") if remove_new_line else text
+    return short_text
+
+
+def chat_with_lm(lm, preset_dict=None, chain=None, model_name=None, pad=0):
+    assert (isinstance(lm, BaseLM))
+    assert (isinstance(chain, list))
+    assert (isinstance(model_name, str) or model_name is None)
+
+    preset_dict = {} if preset_dict is None else preset_dict
+
+    streamed_logger = StreamedLogger(__name__)
+
+    do_exit = False
+    model_name = model_name if model_name is not None else "agent"
+
+    while not do_exit:
+
+        streamed_logger.info("----------------")
+        streamed_logger.info("\n")
+
+        # Launching the CoT engine loop.
+        data_dict = {} | preset_dict
+        for chain_ind, prompt_args in enumerate(chain):
+
+            # Processing the prompt.
+            prompt = prompt_args["prompt"]
+
+            # Filling necessary parameters.
+            user_informed = False
+            field_names = list(iter_params(prompt))
+            for ind, f_name in enumerate(field_names):
+
+                if f_name in data_dict:
+                    continue
+
+                user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
+                                   f"(or 'exit' to quit): ")
+                user_informed = True
+
+                if user_input.lower() == 'exit':
+                    do_exit = True
+                    break
+
+                data_dict[f_name] = user_input
+
+            if do_exit:
+                break
+
+            # In the case of the initial interaction with the chain.
+            # we make sure that aware user for starting interaction.
+            if chain_ind == 0 and not user_informed:
+                user_input = input(f"Enter to continue (or 'exit' to quit) ...")
+                if user_input.lower() == 'exit':
+                    do_exit = True
+
+            # Finally asking LLM.
+            DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
+            actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
+
+            # Returning meta information, passed to LLM.
+            streamed_logger.info(pad_str(f"{model_name} (ask [{chain_ind+1}/{len(chain)}]) ->", pad=pad))
+            streamed_logger.info("\n")
+            streamed_logger.info(nice_output(actual_prompt, remove_new_line=True))
+            streamed_logger.info("\n\n")
+
+            # Response.
+            response = lm.ask_core(batch=[actual_prompt])[0]
+            streamed_logger.info(pad_str(f"{model_name} (resp [{chain_ind+1}/{len(chain)}])->", pad=pad))
+            streamed_logger.info("\n")
+            if isinstance(response, str):
+                streamed_logger.info(nice_output(response, remove_new_line=False))
+                buffer = [response]
+            else:
+                buffer = []
+                for chunk in response:
+                    streamed_logger.info(chunk)
+                    buffer.append(str(chunk))
+
+            streamed_logger.info("\n\n")
+
+            # Collecting the answer for the next turn.
+            data_dict[prompt_args["out"]] = "".join(buffer)
```
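`chat_with_lm` branches on the response type: a plain `str` is logged whole, while any other value is iterated as chunks and joined for the next chain step. A streaming provider can therefore yield tokens from `ask`; the stub below is a hypothetical illustration, not a shipped provider, and the inherited `BaseLM` constructor is deliberately left untouched.

```python
from bulk_chain.core.llm_base import BaseLM


class StreamingStub(BaseLM):
    """ Hypothetical provider whose ask() yields chunks; chat_with_lm
        logs each chunk as it arrives and joins them afterwards.
    """

    def ask(self, prompt):
        for token in ("step ", "one ", "done."):
            yield token
```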
bulk_chain-0.25.2/bulk_chain/core/utils_logger.py ADDED

```diff
@@ -0,0 +1,41 @@
+import logging
+
+
+def StreamedLogger(name: str) -> logging.Logger:
+    """ https://medium.com/@r.das699/optimizing-logging-practices-for-streaming-data-in-python-521798e1ed82
+    """
+    root_handlers = logging.getLogger().handlers
+    current_logger = logging.getLogger(name)
+    if not root_handlers:
+        new_handler = logging.StreamHandler()
+        new_handler.terminator = ""
+        new_handler.setFormatter(logging.Formatter("%(message)s"))
+        current_logger.addHandler(new_handler)
+        current_logger.propagate = False
+        current_logger.setLevel(logging.INFO)
+        return current_logger
+
+    for handler in current_logger.handlers[:]:
+        current_logger.removeHandler(handler)
+
+    for handler_r in root_handlers:
+        if type(handler_r) is logging.StreamHandler:
+            new_handler = logging.StreamHandler()
+            new_handler.terminator = ""
+            new_handler.setFormatter(logging.Formatter("%(message)s"))
+            current_logger.addHandler(new_handler)
+        elif type(handler_r) is logging.FileHandler:
+            new_handler = logging.FileHandler(
+                handler_r.baseFilename,
+                handler_r.mode,
+                handler_r.encoding,
+                handler_r.delay,
+                handler_r.errors,
+            )
+            new_handler.terminator = ""  # This will stop the printing in new line
+            new_handler.setFormatter(logging.Formatter("%(message)s"))
+            current_logger.addHandler(new_handler)
+        else:
+            continue
+    current_logger.propagate = False  # Don't propagate to root logger
+    return current_logger
```
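`StreamedLogger` mirrors the root logger's stream and file handlers but with an empty `terminator`, so consecutive `info` calls continue on the same line, which suits token-by-token LLM output. A short usage sketch:

```python
from bulk_chain.core.utils_logger import StreamedLogger

log = StreamedLogger(__name__)
for chunk in ("Hello", ", ", "world"):
    log.info(chunk)   # renders as one continuous "Hello, world"
log.info("\n")        # newlines are now explicit
```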
bulk_chain-0.25.2/bulk_chain/demo.py ADDED

```diff
@@ -0,0 +1,84 @@
+import json
+
+import argparse
+import logging
+import sys
+
+from source_iter.service_jsonl import JsonlService
+
+from bulk_chain.api import init_llm
+from bulk_chain.core.service_args import CmdArgsService
+from bulk_chain.core.service_json import JsonService
+from bulk_chain.core.service_llm import chat_with_lm
+from bulk_chain.core.service_schema import SchemaService
+from bulk_chain.core.utils import parse_filepath
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def iter_from_json(filepath):
+    with open(filepath, "r") as f:
+        content = json.load(f)
+        for key, value in content.items():
+            yield key, value
+
+
+def iter_from_text_file(filepath):
+    with open(filepath, "r") as f:
+        yield filepath.split('.')[0], f.read()
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="LLM demo usage based on CoT schema")
+    parser.add_argument('--adapter', dest='adapter', type=str, default=None)
+    parser.add_argument('--attempts', dest='attempts', type=int, default=None)
+    parser.add_argument('--src', dest='src', type=str, nargs="*", default=None)
+    parser.add_argument('--schema', dest='schema', type=str, default=None,
+                        help="Path to the JSON file that describes schema")
+    parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
+                        help="Optional trimming prompt by the specified amount of characters.")
+
+    # Extract native arguments.
+    native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
+    args = parser.parse_args(args=native_args[1:])
+
+    # Extract model-related arguments and Initialize Large Language Model.
+    model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
+    model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
+    llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
+
+    # Setup schema.
+    schema = SchemaService(json_data=JsonService.read(args.schema))
+    schema_name = schema.src.get("name", None)
+    if schema is not None:
+        logger.info(f"Using schema: {schema_name}")
+
+    output_providers = {
+        "jsonl": lambda filepath, data_it, header:
+            JsonlService.write(target=filepath,
+                               data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
+    }
+
+    input_file_handlers = {
+        "json": lambda filepath: iter_from_json(filepath),
+        "txt": lambda filepath: iter_from_text_file(filepath)
+    }
+
+    # Input extension type defines the provider.
+    if args.src is None:
+        args.src = []
+    if isinstance(args.src, str):
+        args.src = [args.src]
+    sources = [parse_filepath(s) for s in args.src]
+
+    preset_dict = {}
+    for fp, ext, _ in sources:
+        for key, value in input_file_handlers[ext](fp):
+            if key in preset_dict:
+                raise Exception(f"While at handling {fp}: Key {key} is already registered!")
+            preset_dict[key] = value
+
+    # Launch Demo.
+    chat_with_lm(llm, preset_dict=preset_dict, chain=schema.chain, model_name=llm_model_name)
```
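`demo.py` pre-fills schema variables from `--src` files so the chat loop skips prompting for them: a `.json` file contributes its key/value pairs, and a `.txt` file contributes a single entry keyed by the path prefix before the first dot. A sketch with hypothetical file names:

```python
from bulk_chain.demo import iter_from_json, iter_from_text_file

# text.txt containing "It was the best of times." yields ("text", <contents>),
# so the {text} variable of the schema is satisfied without user input.
preset = dict(iter_from_text_file("text.txt"))

# vars.json such as {"topic": "novels"} contributes its pairs directly.
preset |= dict(iter_from_json("vars.json"))
```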
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/infer.py

```diff
@@ -9,51 +9,28 @@ from source_iter.service_jsonl import JsonlService
 from source_iter.service_sqlite import SQLite3Service
 from tqdm import tqdm
 
-from bulk_chain.api import INFER_MODES, _infer_batch, CWD
+from bulk_chain.api import INFER_MODES, _infer_batch, CWD, init_llm
 from bulk_chain.core.llm_base import BaseLM
 from bulk_chain.core.service_args import CmdArgsService
 from bulk_chain.core.service_dict import DictionaryService
 from bulk_chain.core.service_json import JsonService
-from bulk_chain.core.service_llm import chat_with_lm
 from bulk_chain.core.service_schema import SchemaService
-from bulk_chain.core.utils import
+from bulk_chain.core.utils import handle_table_name, optional_limit_iter, parse_filepath
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
-
 WRITER_PROVIDERS = {
     "sqlite": lambda filepath, table_name, data_it, infer_data_func, **kwargs: SQLite3Service.write(
         data_it=data_it, target=filepath, table_name=table_name, data2col_func=infer_data_func,
         skip_existed=True, **kwargs)
 }
 
-
 READER_PROVIDERS = {
     "sqlite": lambda filepath, table_name: SQLite3Service.read(filepath, table=table_name)
 }
 
 
-def init_llm(**model_kwargs):
-    """ This method perform dynamic initialization of LLM from third-party resource.
-    """
-
-    # List of the Supported models and their API wrappers.
-    models_preset = {
-        "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
-                                        class_name=llm_model_params)(**model_kwargs)
-    }
-
-    # Initialize LLM model.
-    params = args.adapter.split(':')
-    llm_model_type = params[0]
-    llm_model_name = params[1] if len(params) > 1 else params[-1]
-    llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
-    llm = find_by_prefix(d=models_preset, key=llm_model_type)()
-
-    return llm, llm_model_name
-
-
 def iter_content_cached(input_dicts_it, llm, schema, cache_target, limit_prompt=None, **cache_kwargs):
     assert (isinstance(llm, BaseLM))
     assert (isinstance(cache_target, str))
@@ -91,9 +68,8 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description="Infer Instruct LLM inference based on CoT schema")
     parser.add_argument('--adapter', dest='adapter', type=str, default=None)
-    parser.add_argument('--attempts', dest='attempts', type=int, default=None)
     parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
-    parser.add_argument('--src', dest='src', type=str, default=None)
+    parser.add_argument('--src', dest='src', type=str, nargs="?", default=None)
     parser.add_argument('--schema', dest='schema', type=str, default=None,
                         help="Path to the JSON file that describes schema")
     parser.add_argument('--to', dest='to', type=str, default=None, choices=["csv", "sqlite"])
@@ -114,7 +90,7 @@ if __name__ == '__main__':
     # Extract model-related arguments and Initialize Large Language Model.
     model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
     model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
-    llm, llm_model_name = init_llm(**model_args_dict)
+    llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
 
     # Setup schema.
     schema = SchemaService(json_data=JsonService.read(args.schema))
@@ -123,7 +99,6 @@ if __name__ == '__main__':
         logger.info(f"Using schema: {schema_name}")
 
     input_providers = {
-        None: lambda _: chat_with_lm(llm, chain=schema.chain, model_name=llm_model_name),
         "csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
                                                 as_dict=True, skip_header=True,
                                                 delimiter=csv_args_dict.get("delimiter", ","),
@@ -155,14 +130,9 @@ if __name__ == '__main__':
     args.output = args.output.format(model=llm.name()) if args.output is not None else args.output
     tgt_filepath, tgt_ext, tgt_meta = parse_filepath(args.output, default_ext=args.to)
 
-    #
+    # We do not support multiple files for other modes.
     src_filepath, src_ext, src_meta = parse_filepath(args.src)
 
-    # Check whether we are in chat mode.
-    if src_ext is None:
-        input_providers[src_ext](None)
-        exit(0)
-
     def default_output_file_template(ext):
         # This is a default template for output files to be generated.
         return "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema_name]), ext])
```
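With chat mode moved out to `demo.py`, `infer.py` keeps the cached bulk path. A call sketch mirroring `test/test_api.py` further below; the adapter path and schema file are assumptions, and `cache_target` packs the SQLite file and table name into one string:

```python
from bulk_chain.api import init_llm
from bulk_chain.infer import iter_content_cached

llm, _ = init_llm(adapter="dynamic:replicate_104.py:Replicate",
                  model_name="deepseek-ai/deepseek-r1",
                  api_token="<REPLICATE-API-TOKEN>")

data_it = iter_content_cached(
    input_dicts_it=({"ind": i, "text": f"sample {i}"} for i in range(3)),
    llm=llm,
    schema="test/schema/default.json",
    cache_target="out.sqlite:content",  # <file>:<table> for the sqlite cache
    id_column_name="ind")

for row in data_it:
    print(row)
```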
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/PKG-INFO: identical to the PKG-INFO diff shown above.
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/SOURCES.txt

```diff
@@ -3,10 +3,12 @@ README.md
 setup.py
 bulk_chain/__init__.py
 bulk_chain/api.py
+bulk_chain/demo.py
 bulk_chain/infer.py
 bulk_chain.egg-info/PKG-INFO
 bulk_chain.egg-info/SOURCES.txt
 bulk_chain.egg-info/dependency_links.txt
+bulk_chain.egg-info/requires.txt
 bulk_chain.egg-info/top_level.txt
 bulk_chain/core/__init__.py
 bulk_chain/core/llm_base.py
@@ -18,6 +20,9 @@ bulk_chain/core/service_json.py
 bulk_chain/core/service_llm.py
 bulk_chain/core/service_schema.py
 bulk_chain/core/utils.py
+bulk_chain/core/utils_logger.py
+test/test.py
 test/test_api.py
 test/test_args_seeking.py
-test/test_cmdargs.py
+test/test_cmdargs.py
+test/test_provider_batching.py
```
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/setup.py

```diff
@@ -15,7 +15,7 @@ def get_requirements(filenames):
 
 setup(
     name='bulk_chain',
-    version='0.25.0',
+    version='0.25.2',
     python_requires=">=3.6",
     description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
                 'ensuring reliable results for bulk input requests.',
```
bulk_chain-0.25.2/test/test.py ADDED

```diff
@@ -0,0 +1,62 @@
+import random
+import time
+import logging
+
+
+def setup_logger_behaviour(name: str) -> logging.Logger:
+    root_handlers = logging.getLogger().handlers  # gets root logger
+    current_logger = logging.getLogger(name)  # gets current logger
+    if not root_handlers:  # if root logger has no handlers then create streaming handeler only
+        new_handler = logging.StreamHandler()
+        new_handler.terminator = ""
+        new_handler.setFormatter(logging.Formatter("%(message)s"))
+        current_logger.addHandler(new_handler)
+        current_logger.propagate = False
+        current_logger.setLevel(logging.INFO)
+        return current_logger
+
+    # Remove exixting Handlers from the current logger
+    for handler in current_logger.handlers[:]:
+        current_logger.removeHandler(handler)
+
+    for handler_r in root_handlers:  # if root logger has handlers
+        if type(handler_r) is logging.StreamHandler:  # if root logger has streaming handler
+            new_handler = logging.StreamHandler()
+            new_handler.terminator = ""  # This will stop the printing in new line
+            new_handler.setFormatter(logging.Formatter("%(message)s"))  # This will set the format
+            current_logger.addHandler(new_handler)
+        elif type(handler_r) is logging.FileHandler:  # if root logger has file handler
+            new_handler = logging.FileHandler(  # create new file handler
+                handler_r.baseFilename,  # with same filename and other properties
+                handler_r.mode,
+                handler_r.encoding,
+                handler_r.delay,
+                handler_r.errors,
+            )
+            new_handler.terminator = ""  # This will stop the printing in new line
+            new_handler.setFormatter(logging.Formatter("%(message)s"))  # This will set the format
+            current_logger.addHandler(new_handler)
+        else:
+            continue
+    current_logger.propagate = False  # Don't propagate to root logger
+    return current_logger
+
+# Configure the logger
+logger = logging.getLogger(__name__)
+
+class FakeStreamingDataGenerator:
+
+    def stream_data(self):
+        while True:
+            data = random.randint(0, 100)
+            yield data
+            time.sleep(0.5)
+
+# Example usage:
+generator = FakeStreamingDataGenerator()
+stream = generator.stream_data()
+
+logger = setup_logger_behaviour(__name__)  # call you set up function here
+while True:
+    chunk = next(stream)
+    # Replacing print with logger
+    logger.info(chunk)  # Best practice now
```
{bulk_chain-0.25.0 → bulk_chain-0.25.2}/test/test_api.py

```diff
@@ -9,8 +9,9 @@ from bulk_chain.infer import iter_content_cached
 class TestAPI(unittest.TestCase):
 
     llm = dynamic_init(class_dir=join(CWD, ".."),
-                       class_filepath="
-                       class_name="Replicate")(api_token="<API-KEY>"
+                       class_filepath="providers/replicate_104.py",
+                       class_name="Replicate")(api_token="<API-KEY>",
+                                               model_name="deepseek-ai/deepseek-r1")
 
     def it_data(self, n):
         for i in range(n):
@@ -19,7 +20,7 @@ class TestAPI(unittest.TestCase):
     def test_iter_cached(self):
         data_it = iter_content_cached(input_dicts_it=self.it_data(20),
                                       llm=self.llm,
-                                      schema="../
+                                      schema="../schema/default.json",
                                       # Cache-related extra parameters.
                                       cache_target="out.sqlite:content",
                                       id_column_name="ind")
@@ -32,7 +33,7 @@ class TestAPI(unittest.TestCase):
                               llm=self.llm,
                               batch_size=1,
                               return_batch=True,
-                              schema="../
+                              schema="../schema/default.json")
 
         for data in data_it:
             print(data)
```
bulk_chain-0.25.2/test/test_cmdargs.py ADDED

```diff
@@ -0,0 +1,29 @@
+import sys
+import unittest
+
+from bulk_chain.core.service_args import CmdArgsService
+
+
+class TestCmdArgs(unittest.TestCase):
+
+    def test(self):
+
+        # Csv-related.
+        csv_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%csv", end_prefix="%%")
+        print(csv_args)
+        csv_args = CmdArgsService.args_to_dict(csv_args)
+        print("csv\t", csv_args)
+
+        # Model-related.
+        m_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%m", end_prefix="%%")
+        m_args = CmdArgsService.args_to_dict(m_args)
+        print("mod\t", m_args)
+
+        # native.
+        n_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
+        n_args = CmdArgsService.args_to_dict(n_args)
+        print("nat\t", n_args)
+
+
+if __name__ == '__main__':
+    unittest.main()
```
bulk_chain-0.25.2/test/test_provider_batching.py ADDED

```diff
@@ -0,0 +1,31 @@
+import unittest
+from os.path import join
+
+from tqdm import tqdm
+
+from bulk_chain.api import CWD, iter_content
+from bulk_chain.core.utils import dynamic_init
+from utils import iter_test_jsonl_samples
+
+
+class TestProviderBatching(unittest.TestCase):
+
+    llm = dynamic_init(class_dir=join(CWD, ".."),
+                       class_filepath="providers/transformers_flan_t5.py",
+                       class_name="FlanT5")(model_name="nicolay-r/flan-t5-tsa-thor-base",
+                                            max_new_tokens=128)
+
+    def test_iter(self):
+        input_dicts_it = iter_test_jsonl_samples()
+        data_it = iter_content(input_dicts_it=input_dicts_it,
+                               llm=self.llm,
+                               batch_size=20,
+                               return_batch=False,
+                               schema="schema/default.json")
+
+        for item in tqdm(data_it):
+            print(item)
+
+
+if __name__ == '__main__':
+    unittest.main()
```
bulk_chain-0.25.0/README.md DELETED

````diff
@@ -1,91 +0,0 @@
-# bulk-chain 0.25.0
-
-[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
-[](https://x.com/nicolayr_/status/1847969224636961033)
-[](https://pypistats.org/packages/bulk-chain)
-
-<p align="center">
-<img src="logo.png"/>
-</p>
-
-A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
-
-### Main Features
-* ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
-* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
-* ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
-
-### Extra Features
-* ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
-
-
-# Installation
-
-From PyPI:
-
-```bash
-pip install bulk-chain
-```
-
-or latest version from here:
-
-```bash
-pip install git+https://github.com/nicolay-r/bulk-chain@master
-```
-
-## Chain-of-Thought Schema
-
-To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
-This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
-
-Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
-All the variable names are expected to be mentioned in `{}`.
-
-Below, is an example on how to declare your own schema:
-
-```python
-{
-  "name": "schema-name",
-  "schema": [
-    {"prompt": "Given the question '{text}', let's think step-by-step.",
-     "out": "steps"},
-    {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
-     "out": "answer"},
-  ]
-}
-```
-
-Another templates are available [here](/ext/schema/).
-
-# Usage
-
-Preliminary steps:
-
-1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
-2. Wrap or pick **LLM model** from the [list of presets](/ext/).
-
-## API
-
-Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
-
-## Shell
-
-> **NOTE:** You have to install `source-iter` package
-
-```bash
-python3 -m bulk_chain.infer \
-    --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
-    --schema "ext/schema/default.json" \
-    --adapter "dynamic:ext/replicate.py:Replicate" \
-    %%m \
-    --api_token "<REPLICATE-API-TOKEN>" \
-    --temp 0.1
-```
-
-# Embed your LLM
-
-All you have to do is to implement `BaseLM` class, that includes:
-* `__init__` -- for setting up *batching mode support* and (optional) *model name*;
-* `ask(prompt)` -- infer your model with the given `prompt`.
-
-See examples with models [here](/ext).
````
bulk_chain-0.25.0/bulk_chain/core/service_llm.py DELETED

```diff
@@ -1,82 +0,0 @@
-import logging
-
-from bulk_chain.core.llm_base import BaseLM
-from bulk_chain.core.service_data import DataService
-from bulk_chain.core.utils import iter_params
-
-
-def pad_str(text, pad):
-    return text.rjust(len(text) + pad, ' ')
-
-
-def text_wrap(content, width, handle_line=lambda l: l):
-    lines = []
-    for text in content.split('\n'):
-        for i in range(0, len(text), width):
-            line = handle_line(text[i:i + width])
-            lines.append(line)
-    return '\n'.join(lines)
-
-
-def nice_output(text, width, pad=4, remove_new_line=False):
-    short_text = text.replace("\n", "") if remove_new_line else text
-    return text_wrap(content=short_text, width=width, handle_line=lambda line: pad_str(line, pad=pad))
-
-
-def chat_with_lm(lm, chain=None, model_name=None):
-    assert (isinstance(lm, BaseLM))
-    assert (isinstance(chain, list))
-    assert (isinstance(model_name, str) or model_name is None)
-
-    logger = logging.getLogger(__name__)
-    logging.basicConfig(level=logging.INFO)
-
-    do_exit = False
-    model_name = model_name if model_name is not None else "agent"
-
-    while not do_exit:
-
-        logger.info("----------------")
-
-        # Launching the CoT engine loop.
-        data_dict = {}
-        for prompt_args in chain:
-
-            # Processing the prompt.
-            prompt = prompt_args["prompt"]
-
-            # Filling necessary parameters.
-            field_names = list(iter_params(prompt))
-            for ind, f_name in enumerate(field_names):
-
-                if f_name in data_dict:
-                    continue
-
-                user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
-                                   f"(or 'exit' to quit): ")
-
-                if user_input.lower() == 'exit':
-                    do_exit = True
-                    break
-
-                data_dict[f_name] = user_input
-
-            if do_exit:
-                break
-
-            # Finally asking LLM.
-            DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
-            actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
-
-            # Returning meta information, passed to LLM.
-            pad = 4
-            logger.info(pad_str(f"{model_name} (ask) ->", pad=pad))
-            logger.info(nice_output(actual_prompt, pad=pad*2, remove_new_line=True, width=80))
-
-            # Response.
-            response_batch = lm.ask_core(batch=[actual_prompt])
-            logger.info(pad_str(f"{model_name} (resp)->", pad=pad))
-            logger.info(nice_output(response_batch[0], pad=pad * 2, remove_new_line=False, width=80))
-
-            # Collecting the answer for the next turn.
-            data_dict[prompt_args["out"]] = response_batch[0]
```
bulk_chain-0.25.0/test/test_cmdargs.py DELETED

```diff
@@ -1,20 +0,0 @@
-import sys
-
-from bulk_chain.core.service_args import CmdArgsService
-
-
-# Csv-related.
-csv_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%csv", end_prefix="%%")
-print(csv_args)
-csv_args = CmdArgsService.args_to_dict(csv_args)
-print("csv\t", csv_args)
-
-# Model-related.
-m_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%m", end_prefix="%%")
-m_args = CmdArgsService.args_to_dict(m_args)
-print("mod\t", m_args)
-
-# native.
-n_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
-n_args = CmdArgsService.args_to_dict(n_args)
-print("nat\t", n_args)
```
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|