bulk-chain 0.25.2__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/PKG-INFO +4 -52
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/README.md +3 -50
- bulk_chain-1.0.0/bulk_chain/api.py +143 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/llm_base.py +1 -3
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_batch.py +4 -21
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_data.py +9 -5
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/utils.py +15 -25
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/PKG-INFO +4 -52
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/SOURCES.txt +1 -6
- bulk_chain-1.0.0/bulk_chain.egg-info/requires.txt +1 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/setup.py +1 -1
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test_api.py +7 -16
- bulk_chain-1.0.0/test/test_api_streaming.py +52 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test_provider_batching.py +4 -3
- bulk_chain-0.25.2/bulk_chain/api.py +0 -99
- bulk_chain-0.25.2/bulk_chain/core/service_args.py +0 -72
- bulk_chain-0.25.2/bulk_chain/core/service_llm.py +0 -94
- bulk_chain-0.25.2/bulk_chain/core/utils_logger.py +0 -41
- bulk_chain-0.25.2/bulk_chain/demo.py +0 -84
- bulk_chain-0.25.2/bulk_chain/infer.py +0 -161
- bulk_chain-0.25.2/bulk_chain.egg-info/requires.txt +0 -2
- bulk_chain-0.25.2/test/test_cmdargs.py +0 -29
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/LICENSE +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/__init__.py +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/__init__.py +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_dict.py +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_json.py +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_schema.py +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/dependency_links.txt +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/top_level.txt +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/setup.cfg +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test.py +0 -0
- {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test_args_seeking.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bulk_chain
|
|
3
|
-
Version: 0.25.2
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
|
|
5
5
|
Home-page: https://github.com/nicolay-r/bulk-chain
|
|
6
6
|
Author: Nicolay Rusnachenko
|
|
@@ -16,9 +16,8 @@ Requires-Python: >=3.6
|
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
18
|
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: source-iter==0.24.3
|
|
20
19
|
|
|
21
|
-
# bulk-chain 0.25.2
|
|
20
|
+
# bulk-chain 1.0.0
|
|
22
21
|

|
|
23
22
|
[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
|
|
24
23
|
[](https://x.com/nicolayr_/status/1847969224636961033)
|
|
@@ -31,7 +30,7 @@ Requires-Dist: source-iter==0.24.3
|
|
|
31
30
|
<p align="center">
|
|
32
31
|
<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
|
|
33
32
|
<br>
|
|
34
|
-
<a href="https://github.com/nicolay-r/bulk-chain
|
|
33
|
+
<a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
|
|
35
34
|
</p>
|
|
36
35
|
|
|
37
36
|
A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
|
|
@@ -39,11 +38,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
|
|
|
39
38
|
### Main Features
|
|
40
39
|
* ✅ **No-strings**: you're free of LLM dependencies and keep flexible `venv` customization.
|
|
41
40
|
* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
|
|
42
|
-
* ✅ **Provides iterator over infinite amount of input contexts**
|
|
43
|
-
|
|
44
|
-
### Extra Features
|
|
45
|
-
* ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
|
|
46
|
-
|
|
41
|
+
* ✅ **Provides iterator over infinite amount of input contexts**
|
|
47
42
|
|
|
48
43
|
# Installation
|
|
49
44
|
|
|
@@ -88,51 +83,8 @@ Preliminary steps:
|
|
|
88
83
|
1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
|
|
89
84
|
2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
|
|
90
85
|
|
|
91
|
-
## Shell
|
|
92
|
-
|
|
93
|
-
### Demo Mode
|
|
94
|
-
|
|
95
|
-
**demo mode** to interact with LLM via command line with LLM output streaming support.
|
|
96
|
-
The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
|
|
97
|
-
|
|
98
|
-
Quick start with launching demo:
|
|
99
|
-
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
100
|
-
2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
|
|
101
|
-
3. 🚀 Launch `demo.py` as follows:
|
|
102
|
-
```bash
|
|
103
|
-
python3 -m bulk_chain.demo \
|
|
104
|
-
--schema "test/schema/thor_cot_schema.json" \
|
|
105
|
-
--adapter "dynamic:replicate_104.py:Replicate" \
|
|
106
|
-
%%m \
|
|
107
|
-
--model_name "meta/meta-llama-3-70b-instruct" \
|
|
108
|
-
--api_token "<REPLICATE-API-TOKEN>" \
|
|
109
|
-
--stream
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
|
|
113
|
-

|
|
114
86
|
|
|
115
87
|
|
|
116
|
-
### Inference Mode
|
|
117
|
-
|
|
118
|
-
> **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project
|
|
119
|
-
|
|
120
|
-
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
121
|
-
```bash
|
|
122
|
-
wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
|
|
123
|
-
```
|
|
124
|
-
2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
|
|
125
|
-
3. 🚀 Launch inference using `DeepSeek-R1`:
|
|
126
|
-
```bash
|
|
127
|
-
python3 -m bulk_chain.infer \
|
|
128
|
-
--src "<PATH-TO-YOUR-CSV-or-JSONL>" \
|
|
129
|
-
--schema "test/schema/default.json" \
|
|
130
|
-
--adapter "replicate_104.py:Replicate" \
|
|
131
|
-
%%m \
|
|
132
|
-
--model_name "deepseek-ai/deepseek-r1" \
|
|
133
|
-
--api_token "<REPLICATE-API-TOKEN>"
|
|
134
|
-
```
|
|
135
|
-
|
|
136
88
|
## API
|
|
137
89
|
|
|
138
90
|
Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# bulk-chain 0.25.2
|
|
1
|
+
# bulk-chain 1.0.0
|
|
2
2
|

|
|
3
3
|
[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
|
|
4
4
|
[](https://x.com/nicolayr_/status/1847969224636961033)
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
<p align="center">
|
|
12
12
|
<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
|
|
13
13
|
<br>
|
|
14
|
-
<a href="https://github.com/nicolay-r/bulk-chain
|
|
14
|
+
<a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
|
|
15
15
|
</p>
|
|
16
16
|
|
|
17
17
|
A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
|
|
@@ -19,11 +19,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
|
|
|
19
19
|
### Main Features
|
|
20
20
|
* ✅ **No-strings**: you're free of LLM dependencies and keep flexible `venv` customization.
|
|
21
21
|
* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
|
|
22
|
-
* ✅ **Provides iterator over infinite amount of input contexts**
|
|
23
|
-
|
|
24
|
-
### Extra Features
|
|
25
|
-
* ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
|
|
26
|
-
|
|
22
|
+
* ✅ **Provides iterator over infinite amount of input contexts**
|
|
27
23
|
|
|
28
24
|
# Installation
|
|
29
25
|
|
|
@@ -68,51 +64,8 @@ Preliminary steps:
|
|
|
68
64
|
1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
|
|
69
65
|
2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
|
|
70
66
|
|
|
71
|
-
## Shell
|
|
72
|
-
|
|
73
|
-
### Demo Mode
|
|
74
|
-
|
|
75
|
-
**demo mode** to interact with LLM via command line with LLM output streaming support.
|
|
76
|
-
The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
|
|
77
|
-
|
|
78
|
-
Quick start with launching demo:
|
|
79
|
-
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
80
|
-
2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
|
|
81
|
-
3. 🚀 Launch `demo.py` as follows:
|
|
82
|
-
```bash
|
|
83
|
-
python3 -m bulk_chain.demo \
|
|
84
|
-
--schema "test/schema/thor_cot_schema.json" \
|
|
85
|
-
--adapter "dynamic:replicate_104.py:Replicate" \
|
|
86
|
-
%%m \
|
|
87
|
-
--model_name "meta/meta-llama-3-70b-instruct" \
|
|
88
|
-
--api_token "<REPLICATE-API-TOKEN>" \
|
|
89
|
-
--stream
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
|
|
93
|
-

|
|
94
67
|
|
|
95
68
|
|
|
96
|
-
### Inference Mode
|
|
97
|
-
|
|
98
|
-
> **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project
|
|
99
|
-
|
|
100
|
-
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
101
|
-
```bash
|
|
102
|
-
wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
|
|
103
|
-
```
|
|
104
|
-
2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
|
|
105
|
-
3. 🚀 Launch inference using `DeepSeek-R1`:
|
|
106
|
-
```bash
|
|
107
|
-
python3 -m bulk_chain.infer \
|
|
108
|
-
--src "<PATH-TO-YOUR-CSV-or-JSONL>" \
|
|
109
|
-
--schema "test/schema/default.json" \
|
|
110
|
-
--adapter "replicate_104.py:Replicate" \
|
|
111
|
-
%%m \
|
|
112
|
-
--model_name "deepseek-ai/deepseek-r1" \
|
|
113
|
-
--api_token "<REPLICATE-API-TOKEN>"
|
|
114
|
-
```
|
|
115
|
-
|
|
116
69
|
## API
|
|
117
70
|
|
|
118
71
|
Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import os
|
|
3
|
+
from itertools import chain
|
|
4
|
+
|
|
5
|
+
from bulk_chain.core.llm_base import BaseLM
|
|
6
|
+
from bulk_chain.core.service_batch import BatchIterator
|
|
7
|
+
from bulk_chain.core.service_data import DataService
|
|
8
|
+
from bulk_chain.core.service_dict import DictionaryService
|
|
9
|
+
from bulk_chain.core.service_json import JsonService
|
|
10
|
+
from bulk_chain.core.service_schema import SchemaService
|
|
11
|
+
from bulk_chain.core.utils import dynamic_init, find_by_prefix
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
INFER_MODES = {
|
|
15
|
+
"batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
|
|
16
|
+
DataService.limit_prompts(batch, limit=limit_prompt))
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
CWD = os.getcwd()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _iter_entry_content(entry, entry_info=None, **kwargs):
|
|
24
|
+
|
|
25
|
+
if isinstance(entry, str):
|
|
26
|
+
kwargs.get("callback_str_func", lambda *_: None)(entry, entry_info)
|
|
27
|
+
yield entry
|
|
28
|
+
elif isinstance(entry, collections.abc.Iterable):
|
|
29
|
+
h = kwargs.get("callback_stream_func", lambda *_: None)
|
|
30
|
+
h(None, entry_info | {"action": "start"})
|
|
31
|
+
for chunk in map(lambda item: str(item), entry):
|
|
32
|
+
yield chunk
|
|
33
|
+
h(chunk, entry_info)
|
|
34
|
+
h(None, entry_info | {"action": "end"})
|
|
35
|
+
else:
|
|
36
|
+
raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _iter_batch_prompts(c, batch_content_it, **kwargs):
|
|
40
|
+
for ind_in_batch, entry in enumerate(batch_content_it):
|
|
41
|
+
content = DataService.get_prompt_text(
|
|
42
|
+
prompt=entry[c]["prompt"],
|
|
43
|
+
data_dict=entry,
|
|
44
|
+
handle_missed_func=kwargs["handle_missed_value_func"])
|
|
45
|
+
yield ind_in_batch, content
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _iter_batch_responses(p_column, c, batch_content_it, **kwargs):
|
|
49
|
+
p_batch = [item[p_column] for item in batch_content_it]
|
|
50
|
+
# TODO. This part could be async.
|
|
51
|
+
# TODO. ind_in_batch might be a part of the async return.
|
|
52
|
+
for ind_in_batch, entry in enumerate(kwargs["handle_batch_func"](p_batch)):
|
|
53
|
+
yield ind_in_batch, _iter_entry_content(entry=entry, entry_info={"ind": ind_in_batch, "param": c}, **kwargs)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _infer_batch(batch, schema, return_mode, cols=None, **kwargs):
|
|
57
|
+
assert (isinstance(batch, list))
|
|
58
|
+
|
|
59
|
+
if len(batch) == 0:
|
|
60
|
+
return batch
|
|
61
|
+
|
|
62
|
+
if cols is None:
|
|
63
|
+
first_item = batch[0]
|
|
64
|
+
cols = list(first_item.keys()) if cols is None else cols
|
|
65
|
+
|
|
66
|
+
for c in cols:
|
|
67
|
+
|
|
68
|
+
# Handling prompt column.
|
|
69
|
+
if c in schema.p2r:
|
|
70
|
+
content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
|
|
71
|
+
for ind_in_batch, prompt in content_it:
|
|
72
|
+
batch[ind_in_batch][c] = prompt
|
|
73
|
+
|
|
74
|
+
# Handling column for inference.
|
|
75
|
+
if c in schema.r2p:
|
|
76
|
+
content_it = _iter_batch_responses(c=c, p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
|
|
77
|
+
for ind_in_batch, chunk_it in content_it:
|
|
78
|
+
|
|
79
|
+
chunks = []
|
|
80
|
+
for chunk in chunk_it:
|
|
81
|
+
chunks.append(chunk)
|
|
82
|
+
|
|
83
|
+
if return_mode == "chunk":
|
|
84
|
+
yield [ind_in_batch, c, chunk]
|
|
85
|
+
|
|
86
|
+
batch[ind_in_batch][c] = "".join(chunks)
|
|
87
|
+
|
|
88
|
+
if return_mode == "record":
|
|
89
|
+
for record in batch:
|
|
90
|
+
yield record
|
|
91
|
+
|
|
92
|
+
if return_mode == "batch":
|
|
93
|
+
yield batch
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None, return_mode="batch", **kwargs):
|
|
97
|
+
""" This method represent Python API aimed at application of `llm` towards
|
|
98
|
+
iterator of input_dicts via cache_target that refers to the SQLite using
|
|
99
|
+
the given `schema`
|
|
100
|
+
"""
|
|
101
|
+
assert (return_mode in ["batch", "chunk"])
|
|
102
|
+
assert (isinstance(llm, BaseLM))
|
|
103
|
+
|
|
104
|
+
# Quick initialization of the schema.
|
|
105
|
+
if isinstance(schema, str):
|
|
106
|
+
schema = JsonService.read(schema)
|
|
107
|
+
if isinstance(schema, dict):
|
|
108
|
+
schema = SchemaService(json_data=schema)
|
|
109
|
+
|
|
110
|
+
prompts_it = map(
|
|
111
|
+
lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
|
|
112
|
+
input_dicts_it
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
content_it = (_infer_batch(batch=batch,
|
|
116
|
+
handle_batch_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
|
|
117
|
+
return_mode=return_mode,
|
|
118
|
+
schema=schema,
|
|
119
|
+
**kwargs)
|
|
120
|
+
for batch in BatchIterator(prompts_it, batch_size=batch_size))
|
|
121
|
+
|
|
122
|
+
yield from chain.from_iterable(content_it)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def init_llm(adapter, **model_kwargs):
|
|
126
|
+
""" This method perform dynamic initialization of LLM from third-party resource.
|
|
127
|
+
"""
|
|
128
|
+
assert (isinstance(adapter, str))
|
|
129
|
+
|
|
130
|
+
# List of the Supported models and their API wrappers.
|
|
131
|
+
models_preset = {
|
|
132
|
+
"dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
|
|
133
|
+
class_name=llm_model_params)(**model_kwargs)
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
# Initialize LLM model.
|
|
137
|
+
params = adapter.split(':')
|
|
138
|
+
llm_model_type = params[0]
|
|
139
|
+
llm_model_name = params[1] if len(params) > 1 else params[-1]
|
|
140
|
+
llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
|
|
141
|
+
llm = find_by_prefix(d=models_preset, key=llm_model_type)()
|
|
142
|
+
|
|
143
|
+
return llm, llm_model_name
|
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import time
|
|
3
3
|
|
|
4
|
-
from bulk_chain.core.utils import format_model_name
|
|
5
|
-
|
|
6
4
|
|
|
7
5
|
class BaseLM(object):
|
|
8
6
|
|
|
@@ -49,4 +47,4 @@ class BaseLM(object):
|
|
|
49
47
|
raise NotImplemented()
|
|
50
48
|
|
|
51
49
|
def name(self):
|
|
52
|
-
return
|
|
50
|
+
return self.__name.replace("/", "_")
|
|
@@ -1,31 +1,13 @@
|
|
|
1
|
-
class BatchService(object):
|
|
2
|
-
|
|
3
|
-
@staticmethod
|
|
4
|
-
def handle_param_as_batch(batch, src_param, tgt_param, handle_func):
|
|
5
|
-
assert (isinstance(batch, list))
|
|
6
|
-
assert (isinstance(src_param, str))
|
|
7
|
-
assert (callable(handle_func))
|
|
8
|
-
|
|
9
|
-
_batch = [item[src_param] for item in batch]
|
|
10
|
-
|
|
11
|
-
# Do handling for the batch.
|
|
12
|
-
_handled_batch = handle_func(_batch)
|
|
13
|
-
assert (isinstance(_handled_batch, list))
|
|
14
|
-
|
|
15
|
-
# Apply changes.
|
|
16
|
-
for i, item in enumerate(batch):
|
|
17
|
-
item[tgt_param] = _handled_batch[i]
|
|
18
|
-
|
|
19
|
-
|
|
20
1
|
class BatchIterator:
|
|
21
2
|
|
|
22
|
-
def __init__(self, data_iter, batch_size, end_value=None):
|
|
3
|
+
def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
|
|
23
4
|
assert(isinstance(batch_size, int) and batch_size > 0)
|
|
24
5
|
assert(callable(end_value) or end_value is None)
|
|
25
6
|
self.__data_iter = data_iter
|
|
26
7
|
self.__index = 0
|
|
27
8
|
self.__batch_size = batch_size
|
|
28
9
|
self.__end_value = end_value
|
|
10
|
+
self.__filter_func = (lambda _: True) if filter_func is None else filter_func
|
|
29
11
|
|
|
30
12
|
def __iter__(self):
|
|
31
13
|
return self
|
|
@@ -37,7 +19,8 @@ class BatchIterator:
|
|
|
37
19
|
data = next(self.__data_iter)
|
|
38
20
|
except StopIteration:
|
|
39
21
|
break
|
|
40
|
-
|
|
22
|
+
if self.__filter_func(data):
|
|
23
|
+
buffer.append(data)
|
|
41
24
|
if len(buffer) == self.__batch_size:
|
|
42
25
|
break
|
|
43
26
|
|
|
@@ -4,8 +4,8 @@ from bulk_chain.core.utils import iter_params
|
|
|
4
4
|
class DataService(object):
|
|
5
5
|
|
|
6
6
|
@staticmethod
|
|
7
|
-
def
|
|
8
|
-
assert(isinstance(data_dict, dict))
|
|
7
|
+
def __compose_prompt_text(prompt, data_dict, field_names):
|
|
8
|
+
assert (isinstance(data_dict, dict))
|
|
9
9
|
fmt_d = {col_name: data_dict[col_name] for col_name in field_names}
|
|
10
10
|
|
|
11
11
|
# Guarantee that items has correct type.
|
|
@@ -16,10 +16,14 @@ class DataService(object):
|
|
|
16
16
|
return prompt.format(**fmt_d)
|
|
17
17
|
|
|
18
18
|
@staticmethod
|
|
19
|
-
def get_prompt_text(prompt, data_dict, parse_fields_func=iter_params):
|
|
19
|
+
def get_prompt_text(prompt, data_dict, parse_fields_func=iter_params, handle_missed_func=None):
|
|
20
20
|
field_names = list(parse_fields_func(prompt))
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
|
|
22
|
+
for col_name in field_names:
|
|
23
|
+
if col_name not in data_dict:
|
|
24
|
+
data_dict[col_name] = handle_missed_func(col_name)
|
|
25
|
+
|
|
26
|
+
return DataService.__compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
|
|
23
27
|
|
|
24
28
|
@staticmethod
|
|
25
29
|
def limit_prompts(prompts_list, limit=None):
|
|
@@ -2,6 +2,7 @@ import importlib
|
|
|
2
2
|
import logging
|
|
3
3
|
import sys
|
|
4
4
|
from collections import Counter
|
|
5
|
+
from os.path import dirname, join, basename
|
|
5
6
|
|
|
6
7
|
logger = logging.getLogger(__name__)
|
|
7
8
|
logging.basicConfig(level=logging.INFO)
|
|
@@ -47,28 +48,6 @@ def iter_params(text):
|
|
|
47
48
|
beg = pe+1
|
|
48
49
|
|
|
49
50
|
|
|
50
|
-
def format_model_name(name):
|
|
51
|
-
return name.replace("/", "_")
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def parse_filepath(filepath, default_filepath=None, default_ext=None):
|
|
55
|
-
""" This is an auxiliary function for handling sources and targets from cmd string.
|
|
56
|
-
"""
|
|
57
|
-
if filepath is None:
|
|
58
|
-
return default_filepath, default_ext, None
|
|
59
|
-
info = filepath.split(":")
|
|
60
|
-
filepath = info[0]
|
|
61
|
-
meta = info[1] if len(info) > 1 else None
|
|
62
|
-
ext = filepath.split('.')[-1] if default_ext is None else default_ext
|
|
63
|
-
return filepath, ext, meta
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def handle_table_name(name):
|
|
67
|
-
return name.\
|
|
68
|
-
replace('-', '_').\
|
|
69
|
-
replace('.', "_")
|
|
70
|
-
|
|
71
|
-
|
|
72
51
|
def auto_import(name, is_class=False):
|
|
73
52
|
""" Import from the external python packages.
|
|
74
53
|
"""
|
|
@@ -82,13 +61,24 @@ def auto_import(name, is_class=False):
|
|
|
82
61
|
|
|
83
62
|
|
|
84
63
|
def dynamic_init(class_dir, class_filepath, class_name=None):
|
|
85
|
-
|
|
64
|
+
|
|
65
|
+
# Registering path.
|
|
66
|
+
target = join(class_dir, dirname(class_filepath))
|
|
67
|
+
logger.info(f"Adding sys path for `{target}`")
|
|
68
|
+
sys.path.insert(1, target)
|
|
86
69
|
class_path_list = class_filepath.split('/')
|
|
87
|
-
|
|
70
|
+
|
|
71
|
+
# Composing proper class name.
|
|
72
|
+
class_filename = basename(class_path_list[-1])
|
|
73
|
+
if class_filename.endswith(".py"):
|
|
74
|
+
class_filename = class_filename[:-len(".py")]
|
|
75
|
+
|
|
76
|
+
# Loading library.
|
|
88
77
|
class_name = class_path_list[-1].title() if class_name is None else class_name
|
|
89
|
-
class_path = ".".join(
|
|
78
|
+
class_path = ".".join([class_filename, class_name])
|
|
90
79
|
logger.info(f"Dynamic loading for the file and class `{class_path}`")
|
|
91
80
|
cls = auto_import(class_path, is_class=False)
|
|
81
|
+
|
|
92
82
|
return cls
|
|
93
83
|
|
|
94
84
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bulk_chain
|
|
3
|
-
Version: 0.25.2
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
|
|
5
5
|
Home-page: https://github.com/nicolay-r/bulk-chain
|
|
6
6
|
Author: Nicolay Rusnachenko
|
|
@@ -16,9 +16,8 @@ Requires-Python: >=3.6
|
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
18
|
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: source-iter==0.24.3
|
|
20
19
|
|
|
21
|
-
# bulk-chain 0.25.2
|
|
20
|
+
# bulk-chain 1.0.0
|
|
22
21
|

|
|
23
22
|
[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
|
|
24
23
|
[](https://x.com/nicolayr_/status/1847969224636961033)
|
|
@@ -31,7 +30,7 @@ Requires-Dist: source-iter==0.24.3
|
|
|
31
30
|
<p align="center">
|
|
32
31
|
<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
|
|
33
32
|
<br>
|
|
34
|
-
<a href="https://github.com/nicolay-r/bulk-chain
|
|
33
|
+
<a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
|
|
35
34
|
</p>
|
|
36
35
|
|
|
37
36
|
A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
|
|
@@ -39,11 +38,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
|
|
|
39
38
|
### Main Features
|
|
40
39
|
* ✅ **No-strings**: you're free of LLM dependencies and keep flexible `venv` customization.
|
|
41
40
|
* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
|
|
42
|
-
* ✅ **Provides iterator over infinite amount of input contexts**
|
|
43
|
-
|
|
44
|
-
### Extra Features
|
|
45
|
-
* ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
|
|
46
|
-
|
|
41
|
+
* ✅ **Provides iterator over infinite amount of input contexts**
|
|
47
42
|
|
|
48
43
|
# Installation
|
|
49
44
|
|
|
@@ -88,51 +83,8 @@ Preliminary steps:
|
|
|
88
83
|
1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
|
|
89
84
|
2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
|
|
90
85
|
|
|
91
|
-
## Shell
|
|
92
|
-
|
|
93
|
-
### Demo Mode
|
|
94
|
-
|
|
95
|
-
**demo mode** to interact with LLM via command line with LLM output streaming support.
|
|
96
|
-
The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
|
|
97
|
-
|
|
98
|
-
Quick start with launching demo:
|
|
99
|
-
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
100
|
-
2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
|
|
101
|
-
3. 🚀 Launch `demo.py` as follows:
|
|
102
|
-
```bash
|
|
103
|
-
python3 -m bulk_chain.demo \
|
|
104
|
-
--schema "test/schema/thor_cot_schema.json" \
|
|
105
|
-
--adapter "dynamic:replicate_104.py:Replicate" \
|
|
106
|
-
%%m \
|
|
107
|
-
--model_name "meta/meta-llama-3-70b-instruct" \
|
|
108
|
-
--api_token "<REPLICATE-API-TOKEN>" \
|
|
109
|
-
--stream
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
|
|
113
|
-

|
|
114
86
|
|
|
115
87
|
|
|
116
|
-
### Inference Mode
|
|
117
|
-
|
|
118
|
-
> **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project
|
|
119
|
-
|
|
120
|
-
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
121
|
-
```bash
|
|
122
|
-
wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
|
|
123
|
-
```
|
|
124
|
-
2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
|
|
125
|
-
3. 🚀 Launch inference using `DeepSeek-R1`:
|
|
126
|
-
```bash
|
|
127
|
-
python3 -m bulk_chain.infer \
|
|
128
|
-
--src "<PATH-TO-YOUR-CSV-or-JSONL>" \
|
|
129
|
-
--schema "test/schema/default.json" \
|
|
130
|
-
--adapter "replicate_104.py:Replicate" \
|
|
131
|
-
%%m \
|
|
132
|
-
--model_name "deepseek-ai/deepseek-r1" \
|
|
133
|
-
--api_token "<REPLICATE-API-TOKEN>"
|
|
134
|
-
```
|
|
135
|
-
|
|
136
88
|
## API
|
|
137
89
|
|
|
138
90
|
Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
|
|
@@ -3,8 +3,6 @@ README.md
|
|
|
3
3
|
setup.py
|
|
4
4
|
bulk_chain/__init__.py
|
|
5
5
|
bulk_chain/api.py
|
|
6
|
-
bulk_chain/demo.py
|
|
7
|
-
bulk_chain/infer.py
|
|
8
6
|
bulk_chain.egg-info/PKG-INFO
|
|
9
7
|
bulk_chain.egg-info/SOURCES.txt
|
|
10
8
|
bulk_chain.egg-info/dependency_links.txt
|
|
@@ -12,17 +10,14 @@ bulk_chain.egg-info/requires.txt
|
|
|
12
10
|
bulk_chain.egg-info/top_level.txt
|
|
13
11
|
bulk_chain/core/__init__.py
|
|
14
12
|
bulk_chain/core/llm_base.py
|
|
15
|
-
bulk_chain/core/service_args.py
|
|
16
13
|
bulk_chain/core/service_batch.py
|
|
17
14
|
bulk_chain/core/service_data.py
|
|
18
15
|
bulk_chain/core/service_dict.py
|
|
19
16
|
bulk_chain/core/service_json.py
|
|
20
|
-
bulk_chain/core/service_llm.py
|
|
21
17
|
bulk_chain/core/service_schema.py
|
|
22
18
|
bulk_chain/core/utils.py
|
|
23
|
-
bulk_chain/core/utils_logger.py
|
|
24
19
|
test/test.py
|
|
25
20
|
test/test_api.py
|
|
21
|
+
test/test_api_streaming.py
|
|
26
22
|
test/test_args_seeking.py
|
|
27
|
-
test/test_cmdargs.py
|
|
28
23
|
test/test_provider_batching.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tqdm
|
|
@@ -15,7 +15,7 @@ def get_requirements(filenames):
|
|
|
15
15
|
|
|
16
16
|
setup(
|
|
17
17
|
name='bulk_chain',
|
|
18
|
-
version='0.
|
|
18
|
+
version='1.0.0',
|
|
19
19
|
python_requires=">=3.6",
|
|
20
20
|
description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
|
|
21
21
|
'ensuring reliable results for bulk input requests.',
|
|
@@ -3,37 +3,28 @@ from os.path import join
|
|
|
3
3
|
|
|
4
4
|
from bulk_chain.api import iter_content, CWD
|
|
5
5
|
from bulk_chain.core.utils import dynamic_init
|
|
6
|
-
from
|
|
6
|
+
from utils import current_dir, API_TOKEN
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class TestAPI(unittest.TestCase):
|
|
10
10
|
|
|
11
11
|
llm = dynamic_init(class_dir=join(CWD, ".."),
|
|
12
12
|
class_filepath="providers/replicate_104.py",
|
|
13
|
-
class_name="Replicate")(api_token=
|
|
13
|
+
class_name="Replicate")(api_token=API_TOKEN,
|
|
14
14
|
model_name="deepseek-ai/deepseek-r1")
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
@staticmethod
|
|
17
|
+
def it_data(n):
|
|
17
18
|
for i in range(n):
|
|
18
19
|
yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
|
|
19
20
|
|
|
20
|
-
def test_iter_cached(self):
|
|
21
|
-
data_it = iter_content_cached(input_dicts_it=self.it_data(20),
|
|
22
|
-
llm=self.llm,
|
|
23
|
-
schema="../schema/default.json",
|
|
24
|
-
# Cache-related extra parameters.
|
|
25
|
-
cache_target="out.sqlite:content",
|
|
26
|
-
id_column_name="ind")
|
|
27
|
-
|
|
28
|
-
for data in data_it:
|
|
29
|
-
print(data)
|
|
30
|
-
|
|
31
21
|
def test_iter(self):
|
|
32
22
|
data_it = iter_content(input_dicts_it=self.it_data(20),
|
|
33
23
|
llm=self.llm,
|
|
34
24
|
batch_size=1,
|
|
35
|
-
|
|
36
|
-
|
|
25
|
+
handle_missed_value_func=lambda *_: None,
|
|
26
|
+
return_mode="batch",
|
|
27
|
+
schema=join(current_dir, "schema/default.json"))
|
|
37
28
|
|
|
38
29
|
for data in data_it:
|
|
39
30
|
print(data)
|