bulk-chain 0.24.2__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bulk_chain/api.py +99 -0
- bulk_chain/core/llm_base.py +23 -6
- bulk_chain/core/service_args.py +31 -7
- bulk_chain/core/service_batch.py +51 -0
- bulk_chain/core/service_data.py +4 -0
- bulk_chain/core/service_dict.py +10 -0
- bulk_chain/core/service_llm.py +36 -15
- bulk_chain/core/service_schema.py +1 -2
- bulk_chain/core/utils_logger.py +41 -0
- bulk_chain/demo.py +85 -0
- bulk_chain/infer.py +81 -95
- bulk_chain-0.25.1.dist-info/METADATA +131 -0
- bulk_chain-0.25.1.dist-info/RECORD +20 -0
- bulk_chain-0.24.2.dist-info/METADATA +0 -98
- bulk_chain-0.24.2.dist-info/RECORD +0 -15
- {bulk_chain-0.24.2.dist-info → bulk_chain-0.25.1.dist-info}/LICENSE +0 -0
- {bulk_chain-0.24.2.dist-info → bulk_chain-0.25.1.dist-info}/WHEEL +0 -0
- {bulk_chain-0.24.2.dist-info → bulk_chain-0.25.1.dist-info}/top_level.txt +0 -0
bulk_chain/api.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from itertools import chain
|
|
3
|
+
|
|
4
|
+
from bulk_chain.core.llm_base import BaseLM
|
|
5
|
+
from bulk_chain.core.service_batch import BatchIterator, BatchService
|
|
6
|
+
from bulk_chain.core.service_data import DataService
|
|
7
|
+
from bulk_chain.core.service_dict import DictionaryService
|
|
8
|
+
from bulk_chain.core.service_json import JsonService
|
|
9
|
+
from bulk_chain.core.service_schema import SchemaService
|
|
10
|
+
from bulk_chain.core.utils import dynamic_init, find_by_prefix
|
|
11
|
+
|
|
12
|
+
def _as_trimmed_batch(prompts, limit_prompt=None):
    """ Normalize `prompts` into a list of prompts, each one cut
        to at most `limit_prompt` characters (no cut when it is None).

        A single prompt given as a plain string is wrapped into a one-element
        list, since both modes below forward a list to `llm.ask_core`.
    """
    batch = [prompts] if isinstance(prompts, str) else list(prompts)
    if limit_prompt is None:
        return batch
    return [p[:limit_prompt] for p in batch]


# Inference modes: every entry is a callable `(llm, prompts, limit_prompt=None)`
# that forwards the optionally trimmed prompts to `llm.ask_core` and returns
# the result of that call.
INFER_MODES = {
    # The "default" mode used to slice its argument as is, so a one-element
    # batch got sliced as a list and its prompt was never trimmed.
    # Now it accepts a single prompt as well as a list of prompts.
    "default": lambda llm, prompt, limit_prompt=None: llm.ask_core(
        _as_trimmed_batch(prompt, limit_prompt)),
    "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
        _as_trimmed_batch(batch, limit_prompt))
}


# Working directory, captured once at import time.
CWD = os.getcwd()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _update_batch_content(c, batch, schema, infer_func):
    """ Fill the column `c` for every record of the `batch`, in place.

        If `c` is listed in `schema.p2r`, the value stored under `c` is replaced
        by the prompt text composed from its "prompt" entry and the record itself.
        If `c` is listed in `schema.r2p`, the values of the related prompt column
        are handed to `infer_func` as a single batch and the results are
        written back under `c`.
    """
    assert (isinstance(batch, list))
    assert (isinstance(c, str))

    if c in schema.p2r:
        for record in batch:
            template = record[c]["prompt"]
            record[c] = DataService.get_prompt_text(prompt=template, data_dict=record)

    if c in schema.r2p:
        # This instruction takes a lot of time in a non-batching mode.
        BatchService.handle_param_as_batch(
            batch=batch,
            src_param=schema.r2p[c],
            tgt_param=c,
            handle_func=lambda prompts: infer_func(prompts))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _infer_batch(batch, schema, infer_func, cols=None):
|
|
40
|
+
assert (isinstance(batch, list))
|
|
41
|
+
assert (callable(infer_func))
|
|
42
|
+
|
|
43
|
+
if len(batch) == 0:
|
|
44
|
+
return batch
|
|
45
|
+
|
|
46
|
+
if cols is None:
|
|
47
|
+
first_item = batch[0]
|
|
48
|
+
cols = first_item.keys() if cols is None else cols
|
|
49
|
+
|
|
50
|
+
for c in cols:
|
|
51
|
+
_update_batch_content(c=c, batch=batch, schema=schema, infer_func=infer_func)
|
|
52
|
+
|
|
53
|
+
return batch
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, limit_prompt=None):
    """ This method represents Python API aimed at lazy application of `llm`
        towards the iterator of input dicts according to the given `schema`.

        :param input_dicts_it: iterable of dicts; every dict is extended in place
            with the entries of `schema.cot_args`.
        :param llm: instance of `BaseLM`.
        :param schema: `SchemaService` instance, or dict with the schema contents,
            or string that is passed to `JsonService.read`.
        :param batch_size: amount of records grouped together by `BatchIterator`.
        :param return_batch: yield lists of records when True,
            otherwise yield records one by one.
        :param limit_prompt: optional amount of characters to keep per prompt.
    """
    assert (isinstance(llm, BaseLM))

    # Quick initialization of the schema.
    if isinstance(schema, str):
        schema = JsonService.read(schema)
    if isinstance(schema, dict):
        schema = SchemaService(json_data=schema)

    def _ask(prompts):
        return INFER_MODES["batch"](llm, prompts, limit_prompt)

    prompts_it = map(
        lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
        input_dicts_it
    )

    content_it = (_infer_batch(batch=records, infer_func=_ask, schema=schema)
                  for records in BatchIterator(prompts_it, batch_size=batch_size))

    yield from content_it if return_batch else chain.from_iterable(content_it)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def init_llm(adapter, **model_kwargs):
    """ This method performs dynamic initialization of LLM from third-party resource.

        The `adapter` string is split by ':' into the following parts:
        `<type>[:<filepath>[:<class_name>]]`. The type selects the preset
        (matched via `find_by_prefix`), the other parts are passed
        to `dynamic_init` as `class_filepath` and `class_name`.

        :param adapter: string description of the model adapter.
        :param model_kwargs: keyword arguments for the model constructor.
        :return: tuple of (llm, llm_model_name).
        :raises TypeError: when `adapter` is not a string (e.g. it was not provided).
    """
    if not isinstance(adapter, str):
        raise TypeError(
            "`adapter` is expected to be a string of the form "
            "`<type>:<filepath>[:<class_name>]`, got: {!r}".format(adapter))

    # Parse adapter description.
    params = adapter.split(':')
    llm_model_type = params[0]
    llm_model_name = params[1] if len(params) > 1 else params[-1]
    llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None

    # List of the Supported models and their API wrappers.
    models_preset = {
        "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
                                        class_name=llm_model_params)(**model_kwargs)
    }

    # Initialize LLM model.
    llm = find_by_prefix(d=models_preset, key=llm_model_type)()

    return llm, llm_model_name
|
bulk_chain/core/llm_base.py
CHANGED
|
@@ -6,29 +6,46 @@ from bulk_chain.core.utils import format_model_name
|
|
|
6
6
|
|
|
7
7
|
class BaseLM(object):
|
|
8
8
|
|
|
9
|
-
def __init__(self, name, attempts=None, delay_sec=1, enable_log=True,
|
|
9
|
+
def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
|
|
10
|
+
support_batching=False, **kwargs):
|
|
11
|
+
|
|
10
12
|
self.__name = name
|
|
11
13
|
self.__attempts = 1 if attempts is None else attempts
|
|
12
14
|
self.__delay_sec = delay_sec
|
|
15
|
+
self.__support_batching = support_batching
|
|
13
16
|
|
|
14
17
|
if enable_log:
|
|
15
18
|
self.__logger = logging.getLogger(__name__)
|
|
16
19
|
logging.basicConfig(level=logging.INFO)
|
|
17
20
|
|
|
18
|
-
def
|
|
21
|
+
def ask_core(self, batch):
|
|
19
22
|
|
|
20
23
|
for i in range(self.__attempts):
|
|
21
24
|
try:
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
+
if self.__support_batching:
|
|
26
|
+
# Launch in batch mode.
|
|
27
|
+
content = self.ask(batch)
|
|
28
|
+
else:
|
|
29
|
+
# Launch in non-batch mode.
|
|
30
|
+
assert len(batch) == 1, "The LM does not support batching," \
|
|
31
|
+
f" while size of the content is {len(batch)} which is not equal 1. " \
|
|
32
|
+
f"Please enable batch-supporting or set required inference settings."
|
|
33
|
+
content = batch[0]
|
|
34
|
+
|
|
35
|
+
response = self.ask(content)
|
|
36
|
+
|
|
37
|
+
# Wrapping into batch the response in the case of non-batching mode.
|
|
38
|
+
return response if self.__support_batching else [response]
|
|
39
|
+
|
|
40
|
+
except Exception as e:
|
|
25
41
|
if self.__logger is not None:
|
|
26
42
|
self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i, self.__attempts))
|
|
43
|
+
self.__logger.info(e)
|
|
27
44
|
time.sleep(self.__delay_sec)
|
|
28
45
|
|
|
29
46
|
raise Exception("Can't infer")
|
|
30
47
|
|
|
31
|
-
def ask(self,
|
|
48
|
+
def ask(self, content):
    """ Infer the response for the given `content`.
        Expected to be overridden by the particular model wrapper.

        :raises NotImplementedError: always, in this base implementation.
    """
    # `NotImplemented` is a constant for rich comparisons and it is not callable:
    # `raise NotImplemented()` ends up with TypeError instead of the intended error.
    raise NotImplementedError()
|
|
33
50
|
|
|
34
51
|
def name(self):
|
bulk_chain/core/service_args.py
CHANGED
|
@@ -12,6 +12,11 @@ class CmdArgsService:
|
|
|
12
12
|
def iter_arguments(lst):
|
|
13
13
|
|
|
14
14
|
def __release():
|
|
15
|
+
|
|
16
|
+
# We use the True value by default to treat the related parameter as flag.
|
|
17
|
+
if len(buf) == 0:
|
|
18
|
+
buf.append(True)
|
|
19
|
+
|
|
15
20
|
return key, buf if len(buf) > 1 else buf[0]
|
|
16
21
|
|
|
17
22
|
key = None
|
|
@@ -29,18 +34,37 @@ class CmdArgsService:
|
|
|
29
34
|
buf.append(a)
|
|
30
35
|
|
|
31
36
|
# Sharing the remaining params.
|
|
32
|
-
if
|
|
37
|
+
if key is not None:
|
|
33
38
|
yield __release()
|
|
34
39
|
|
|
35
40
|
@staticmethod
|
|
36
|
-
def
|
|
41
|
+
def __find_suffix_ind(lst, idx_from, end_prefix):
|
|
42
|
+
for i in range(idx_from, len(lst)):
|
|
43
|
+
if lst[i].startswith(end_prefix):
|
|
44
|
+
return i
|
|
45
|
+
return len(lst)
|
|
46
|
+
|
|
47
|
+
@staticmethod
|
|
48
|
+
def extract_native_args(lst, end_prefix):
|
|
49
|
+
return lst[:CmdArgsService.__find_suffix_ind(lst, idx_from=0, end_prefix=end_prefix)]
|
|
50
|
+
|
|
51
|
+
@staticmethod
|
|
52
|
+
def find_grouped_args(lst, starts_with, end_prefix):
|
|
37
53
|
"""Slices a list in two, cutting on index matching "sep"
|
|
38
54
|
"""
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
return
|
|
55
|
+
|
|
56
|
+
# Checking the presence of starts_with.
|
|
57
|
+
# We have to return empty content in the case of absence starts_with in the lst.
|
|
58
|
+
if starts_with not in lst:
|
|
59
|
+
return []
|
|
60
|
+
|
|
61
|
+
# Assigning start index.
|
|
62
|
+
idx_from = lst.index(starts_with) + 1
|
|
63
|
+
|
|
64
|
+
# Assigning end index.
|
|
65
|
+
idx_to = CmdArgsService.__find_suffix_ind(lst, idx_from=idx_from, end_prefix=end_prefix)
|
|
66
|
+
|
|
67
|
+
return lst[idx_from:idx_to]
|
|
44
68
|
|
|
45
69
|
@staticmethod
|
|
46
70
|
def args_to_dict(args):
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
class BatchService(object):

    @staticmethod
    def handle_param_as_batch(batch, src_param, tgt_param, handle_func):
        """ Collects values of `src_param` across the records of `batch`, handles
            them all at once via `handle_func`, and stores the i-th result
            under `tgt_param` of the i-th record (in place).

            :param batch: list of records (dicts).
            :param src_param: key of the value to be handled.
            :param tgt_param: key for storing the handled value.
            :param handle_func: callable that receives list of values and
                returns list of results of the same size.
            :raises ValueError: when the amount of results differs from
                the amount of records.
        """
        assert (isinstance(batch, list))
        assert (isinstance(src_param, str))
        assert (callable(handle_func))

        _batch = [item[src_param] for item in batch]

        # Do handling for the batch.
        _handled_batch = handle_func(_batch)
        assert (isinstance(_handled_batch, list))

        # Results are matched with records by position, so a size mismatch
        # means that some results would be misassigned or silently dropped.
        if len(_handled_batch) != len(batch):
            raise ValueError(
                f"The amount of handled items ({len(_handled_batch)}) differs "
                f"from the size of the batch ({len(batch)})")

        # Apply changes.
        for item, handled in zip(batch, _handled_batch):
            item[tgt_param] = handled
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BatchIterator:
    """ Groups items of `data_iter` into lists of at most `batch_size` items.

        Once the data is over, the iteration stops; however, when `end_value`
        is given, the result of `end_value()` is returned instead for
        every further request, i.e. the iteration does not stop by itself.
    """

    def __init__(self, data_iter, batch_size, end_value=None):
        assert(isinstance(batch_size, int) and batch_size > 0)
        assert(callable(end_value) or end_value is None)
        # Items are pulled on demand, so we need an iterator here:
        # `iter` allows passing lists and other iterables as well.
        self.__data_iter = iter(data_iter)
        self.__index = 0
        self.__batch_size = batch_size
        self.__end_value = end_value

    def __iter__(self):
        return self

    def __next__(self):
        buffer = []
        for data in self.__data_iter:
            buffer.append(data)
            if len(buffer) == self.__batch_size:
                break

        # The last batch might be shorter than the batch size.
        if buffer:
            self.__index += 1
            return buffer

        if self.__end_value is None:
            raise StopIteration

        return self.__end_value()
|
bulk_chain/core/service_data.py
CHANGED
|
@@ -20,3 +20,7 @@ class DataService(object):
|
|
|
20
20
|
field_names = list(parse_fields_func(prompt))
|
|
21
21
|
return DataService.compose_prompt_text(
|
|
22
22
|
prompt=prompt, data_dict=data_dict, field_names=field_names)
|
|
23
|
+
|
|
24
|
+
@staticmethod
|
|
25
|
+
def limit_prompts(prompts_list, limit=None):
|
|
26
|
+
return [p[:limit] if limit is not None else p for p in prompts_list]
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
class DictionaryService:

    @staticmethod
    def custom_update(src_dict, other_dict):
        """ Copies entries of `other_dict` into `src_dict` (in place) and
            returns `src_dict`. Overwriting is not permitted: an exception
            is raised on the first key that exists in both dicts.
        """
        for k in other_dict:
            v = other_dict[k]
            if k not in src_dict:
                src_dict[k] = v
                continue
            raise Exception(f"The key `{k}` is already defined in both dicts with values: "
                            f"`{src_dict[k]}` (src) and `{v}` (other)")
        return src_dict
|
bulk_chain/core/service_llm.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
1
|
from bulk_chain.core.llm_base import BaseLM
|
|
4
2
|
from bulk_chain.core.service_data import DataService
|
|
5
3
|
from bulk_chain.core.utils import iter_params
|
|
4
|
+
from bulk_chain.core.utils_logger import StreamedLogger
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
def pad_str(text, pad):
|
|
@@ -23,29 +22,32 @@ def nice_output(text, width, pad=4, remove_new_line=False):
|
|
|
23
22
|
return text_wrap(content=short_text, width=width, handle_line=lambda line: pad_str(line, pad=pad))
|
|
24
23
|
|
|
25
24
|
|
|
26
|
-
def chat_with_lm(lm, chain=None, model_name=None):
|
|
25
|
+
def chat_with_lm(lm, preset_dict=None, chain=None, model_name=None, line_width=80, pad=0):
|
|
27
26
|
assert (isinstance(lm, BaseLM))
|
|
28
27
|
assert (isinstance(chain, list))
|
|
29
28
|
assert (isinstance(model_name, str) or model_name is None)
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
|
|
30
|
+
preset_dict = {} if preset_dict is None else preset_dict
|
|
31
|
+
|
|
32
|
+
streamed_logger = StreamedLogger(__name__)
|
|
33
33
|
|
|
34
34
|
do_exit = False
|
|
35
35
|
model_name = model_name if model_name is not None else "agent"
|
|
36
36
|
|
|
37
37
|
while not do_exit:
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
streamed_logger.info("----------------")
|
|
40
|
+
streamed_logger.info("\n")
|
|
40
41
|
|
|
41
42
|
# Launching the CoT engine loop.
|
|
42
|
-
data_dict = {}
|
|
43
|
-
for prompt_args in chain:
|
|
43
|
+
data_dict = {} | preset_dict
|
|
44
|
+
for chain_ind, prompt_args in enumerate(chain):
|
|
44
45
|
|
|
45
46
|
# Processing the prompt.
|
|
46
47
|
prompt = prompt_args["prompt"]
|
|
47
48
|
|
|
48
49
|
# Filling necessary parameters.
|
|
50
|
+
user_informed = False
|
|
49
51
|
field_names = list(iter_params(prompt))
|
|
50
52
|
for ind, f_name in enumerate(field_names):
|
|
51
53
|
|
|
@@ -54,6 +56,7 @@ def chat_with_lm(lm, chain=None, model_name=None):
|
|
|
54
56
|
|
|
55
57
|
user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
|
|
56
58
|
f"(or 'exit' to quit): ")
|
|
59
|
+
user_informed = True
|
|
57
60
|
|
|
58
61
|
if user_input.lower() == 'exit':
|
|
59
62
|
do_exit = True
|
|
@@ -64,19 +67,37 @@ def chat_with_lm(lm, chain=None, model_name=None):
|
|
|
64
67
|
if do_exit:
|
|
65
68
|
break
|
|
66
69
|
|
|
70
|
+
# In the case of the initial interaction with the chain.
|
|
71
|
+
# we make sure that aware user for starting interaction.
|
|
72
|
+
if chain_ind == 0 and not user_informed:
|
|
73
|
+
user_input = input(f"Enter to continue (or 'exit' to quit) ...")
|
|
74
|
+
if user_input.lower() == 'exit':
|
|
75
|
+
do_exit = True
|
|
76
|
+
|
|
67
77
|
# Finally asking LLM.
|
|
68
78
|
DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
|
|
69
79
|
actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
|
|
70
80
|
|
|
71
81
|
# Returning meta information, passed to LLM.
|
|
72
|
-
pad
|
|
73
|
-
|
|
74
|
-
|
|
82
|
+
streamed_logger.info(pad_str(f"{model_name} (ask [{chain_ind+1}/{len(chain)}]) ->", pad=pad))
|
|
83
|
+
streamed_logger.info("\n")
|
|
84
|
+
streamed_logger.info(nice_output(actual_prompt, pad=pad, remove_new_line=True, width=line_width))
|
|
85
|
+
streamed_logger.info("\n\n")
|
|
75
86
|
|
|
76
87
|
# Response.
|
|
77
|
-
response = lm.
|
|
78
|
-
|
|
79
|
-
|
|
88
|
+
response = lm.ask_core(batch=[actual_prompt])[0]
|
|
89
|
+
streamed_logger.info(pad_str(f"{model_name} (resp [{chain_ind+1}/{len(chain)}])->", pad=pad))
|
|
90
|
+
streamed_logger.info("\n")
|
|
91
|
+
if isinstance(response, str):
|
|
92
|
+
streamed_logger.info(nice_output(response, pad=pad, remove_new_line=False, width=line_width))
|
|
93
|
+
buffer = [response]
|
|
94
|
+
else:
|
|
95
|
+
buffer = []
|
|
96
|
+
for chunk in response:
|
|
97
|
+
streamed_logger.info(chunk)
|
|
98
|
+
buffer.append(str(chunk))
|
|
99
|
+
|
|
100
|
+
streamed_logger.info("\n\n")
|
|
80
101
|
|
|
81
102
|
# Collecting the answer for the next turn.
|
|
82
|
-
data_dict[prompt_args["out"]] =
|
|
103
|
+
data_dict[prompt_args["out"]] = "".join(buffer)
|
|
@@ -2,12 +2,11 @@ class SchemaService(object):
|
|
|
2
2
|
|
|
3
3
|
def __init__(self, json_data):
|
|
4
4
|
self.src = json_data
|
|
5
|
-
self.name = self.src["name"]
|
|
6
5
|
self.r2p, self.p2r, self.cot_args, self.chain = SchemaService.__init_schema(prompts=json_data["schema"])
|
|
7
6
|
|
|
8
7
|
@classmethod
|
|
9
8
|
def from_prompt(cls, prompt):
|
|
10
|
-
prompt_schema = {"
|
|
9
|
+
prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
|
|
11
10
|
return cls(prompt_schema)
|
|
12
11
|
|
|
13
12
|
@staticmethod
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def StreamedLogger(name: str) -> logging.Logger:
    """ https://medium.com/@r.das699/optimizing-logging-practices-for-streaming-data-in-python-521798e1ed82

        Sets up the logger `name` for streamed output: messages are written
        as they are (no line terminator, "%(message)s" format) and are not
        propagated to the root logger.

        When the root logger has no handlers, a single stream handler is used.
        Otherwise the handlers mirror the root ones: a stream handler per root
        `StreamHandler` and a file handler with the same file settings per root
        `FileHandler` (exact types only; other handlers are skipped).

        The function is safe to be called several times for the same name:
        handlers from the earlier calls are dropped first, so the output
        is not duplicated.
    """

    def _streamed(handler):
        handler.terminator = ""  # This will stop the printing in new line
        handler.setFormatter(logging.Formatter("%(message)s"))
        return handler

    root_handlers = logging.getLogger().handlers
    current_logger = logging.getLogger(name)

    # Drop handlers from earlier calls in any case: previously this was not
    # done when the root logger had no handlers, so every extra call
    # attached one more stream handler.
    for handler in current_logger.handlers[:]:
        current_logger.removeHandler(handler)

    current_logger.propagate = False  # Don't propagate to root logger

    if not root_handlers:
        current_logger.addHandler(_streamed(logging.StreamHandler()))
        current_logger.setLevel(logging.INFO)
        return current_logger

    for handler_r in root_handlers:
        # Exact type checks are intended: `FileHandler` is a subclass of `StreamHandler`.
        if type(handler_r) is logging.StreamHandler:
            current_logger.addHandler(_streamed(logging.StreamHandler()))
        elif type(handler_r) is logging.FileHandler:
            current_logger.addHandler(_streamed(logging.FileHandler(
                handler_r.baseFilename,
                handler_r.mode,
                handler_r.encoding,
                handler_r.delay,
                handler_r.errors,
            )))

    return current_logger
|
bulk_chain/demo.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from source_iter.service_jsonl import JsonlService
|
|
8
|
+
|
|
9
|
+
from bulk_chain.api import init_llm
|
|
10
|
+
from bulk_chain.core.service_args import CmdArgsService
|
|
11
|
+
from bulk_chain.core.service_json import JsonService
|
|
12
|
+
from bulk_chain.core.service_llm import chat_with_lm
|
|
13
|
+
from bulk_chain.core.service_schema import SchemaService
|
|
14
|
+
from bulk_chain.core.utils import parse_filepath
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
logging.basicConfig(level=logging.INFO)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def iter_from_json(filepath):
    """ Iterates over (key, value) pairs of the top-level
        object of the JSON file at `filepath`.
    """
    # JSON files are expected to be UTF-8 encoded, so we do not
    # rely on the platform-dependent default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        content = json.load(f)

    yield from content.items()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def iter_from_text_file(filepath):
    """ Yields a single (key, text) pair for the text file at `filepath`:
        the key is the file name without directories and without
        the extension, the text is the whole file content.
    """
    from os.path import basename, splitext

    # Splitting the whole path by the first dot results in an empty key for
    # paths like `./text.txt` and keeps directories as a part of the key.
    key = splitext(basename(filepath))[0]

    with open(filepath, "r") as f:
        yield key, f.read()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="LLM demo usage based on CoT schema")
    parser.add_argument('--adapter', dest='adapter', type=str, default=None)
    parser.add_argument('--attempts', dest='attempts', type=int, default=None)
    parser.add_argument('--src', dest='src', type=str, nargs="*", default=None)
    parser.add_argument('--schema', dest='schema', type=str, default=None,
                        help="Path to the JSON file that describes schema")
    parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
                        help="Optional trimming prompt by the specified amount of characters.")

    # Extract native arguments: everything that goes before the first `%%`-prefixed item.
    native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
    args = parser.parse_args(args=native_args[1:])

    # Extract model-related arguments and Initialize Large Language Model.
    model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
    model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
    llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)

    # Setup schema.
    schema = SchemaService(json_data=JsonService.read(args.schema))
    schema_name = schema.src.get("name", None)
    # The name is optional: report it only when it is provided
    # (checking `schema` itself is always true at this point).
    if schema_name is not None:
        logger.info(f"Using schema: {schema_name}")

    # Input extension type defines the provider.
    input_file_handlers = {
        "json": lambda filepath: iter_from_json(filepath),
        "txt": lambda filepath: iter_from_text_file(filepath)
    }

    if args.src is None:
        args.src = []
    if isinstance(args.src, str):
        args.src = [args.src]
    sources = [parse_filepath(s) for s in args.src]

    # Collect preset values from all the sources; keys must be unique.
    preset_dict = {}
    for fp, ext, _ in sources:
        for key, value in input_file_handlers[ext](fp):
            if key in preset_dict:
                raise Exception(f"While at handling {fp}: Key {key} is already registered!")
            preset_dict[key] = value

    # Launch Demo.
    chat_with_lm(llm, preset_dict=preset_dict, chain=schema.chain, model_name=llm_model_name,
                 line_width=120)
|
bulk_chain/infer.py
CHANGED
|
@@ -1,98 +1,67 @@
|
|
|
1
|
-
import os
|
|
2
1
|
from os.path import join, basename
|
|
3
2
|
|
|
4
3
|
import argparse
|
|
5
4
|
import logging
|
|
6
5
|
import sys
|
|
7
6
|
|
|
8
|
-
from tqdm import tqdm
|
|
9
|
-
|
|
10
7
|
from source_iter.service_csv import CsvService
|
|
11
8
|
from source_iter.service_jsonl import JsonlService
|
|
12
9
|
from source_iter.service_sqlite import SQLite3Service
|
|
10
|
+
from tqdm import tqdm
|
|
13
11
|
|
|
12
|
+
from bulk_chain.api import INFER_MODES, _infer_batch, CWD, init_llm
|
|
14
13
|
from bulk_chain.core.llm_base import BaseLM
|
|
15
14
|
from bulk_chain.core.service_args import CmdArgsService
|
|
16
|
-
from bulk_chain.core.
|
|
15
|
+
from bulk_chain.core.service_dict import DictionaryService
|
|
17
16
|
from bulk_chain.core.service_json import JsonService
|
|
18
|
-
from bulk_chain.core.service_llm import chat_with_lm
|
|
19
17
|
from bulk_chain.core.service_schema import SchemaService
|
|
20
|
-
from bulk_chain.core.utils import
|
|
18
|
+
from bulk_chain.core.utils import handle_table_name, optional_limit_iter, parse_filepath
|
|
21
19
|
|
|
22
20
|
logger = logging.getLogger(__name__)
|
|
23
21
|
logging.basicConfig(level=logging.INFO)
|
|
24
22
|
|
|
23
|
+
def _write_sqlite(filepath, table_name, data_it, infer_data_func, **kwargs):
    """ Forwards `data_it` to `SQLite3Service.write` with `filepath` as the
        target and `infer_data_func` as `data2col_func`;
        `skip_existed` is always set to True.
    """
    return SQLite3Service.write(
        data_it=data_it, target=filepath, table_name=table_name, data2col_func=infer_data_func,
        skip_existed=True, **kwargs)


# Writers by the type of the storage.
WRITER_PROVIDERS = {
    "sqlite": _write_sqlite
}
|
|
25
28
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def init_llm(**model_kwargs):
|
|
30
|
-
""" This method perform dynamic initialization of LLM from third-party resource.
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
# List of the Supported models and their API wrappers.
|
|
34
|
-
models_preset = {
|
|
35
|
-
"dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
|
|
36
|
-
class_name=llm_model_params)(**model_kwargs)
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
# Initialize LLM model.
|
|
40
|
-
params = args.adapter.split(':')
|
|
41
|
-
llm_model_type = params[0]
|
|
42
|
-
llm_model_name = params[1] if len(params) > 1 else params[-1]
|
|
43
|
-
llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
|
|
44
|
-
llm = find_by_prefix(d=models_preset, key=llm_model_type)()
|
|
45
|
-
|
|
46
|
-
return llm, llm_model_name
|
|
47
|
-
|
|
29
|
+
def _read_sqlite(filepath, table_name):
    """ Forwards to `SQLite3Service.read` for the table `table_name` of `filepath`.
    """
    return SQLite3Service.read(filepath, table=table_name)


# Readers by the type of the storage.
READER_PROVIDERS = {
    "sqlite": _read_sqlite
}
|
|
48
32
|
|
|
49
|
-
def init_schema(json_filepath):
|
|
50
|
-
return SchemaService(json_data=JsonService.read(json_filepath))
|
|
51
33
|
|
|
52
|
-
|
|
53
|
-
def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table, id_column_name):
|
|
54
|
-
""" This method represent Python API aimed at application of `llm` towards
|
|
55
|
-
iterator of input_dicts via cache_target that refers to the SQLite using
|
|
56
|
-
the given `schema`
|
|
57
|
-
"""
|
|
34
|
+
def iter_content_cached(input_dicts_it, llm, schema, cache_target, limit_prompt=None, **cache_kwargs):
    """ Applies `llm` towards the iterator of input dicts according to the `schema`,
        writing the results into the SQLite cache first, and then
        reading the cached table back.

        :param input_dicts_it: iterable of dicts; every dict is extended in place
            with the entries of `schema.cot_args`.
        :param llm: instance of `BaseLM`.
        :param schema: `SchemaService` instance, or dict with the schema contents,
            or string that is passed to `JsonService.read`.
        :param cache_target: string that is parsed by `parse_filepath` into
            the cache filepath and the table name.
        :param limit_prompt: optional amount of characters to keep per prompt.
        :param cache_kwargs: extra keyword arguments for the cache writer.
        :return: result of the SQLite reader for the cached table.
    """
    assert (isinstance(llm, BaseLM))
    assert (isinstance(cache_target, str))

    # Quick initialization of the schema.
    if isinstance(schema, str):
        schema = JsonService.read(schema)
    if isinstance(schema, dict):
        schema = SchemaService(json_data=schema)

    # Iterator of the queries.
    prompts_it = map(
        lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
        input_dicts_it
    )

    # Parse target.
    cache_filepath, _, cache_table = parse_filepath(filepath=cache_target)

    def _ask(batch):
        # Here we deal with a (one-element) list of prompts, so the "batch" mode
        # is required for trimming every prompt: the "default" mode applies
        # the limit to its argument as is, i.e. it would slice the list.
        return INFER_MODES["batch"](llm, batch, limit_prompt)

    # Perform caching first.
    WRITER_PROVIDERS["sqlite"](
        filepath=cache_filepath, table_name=cache_table,
        data_it=tqdm(prompts_it, desc="Iter content"),
        infer_data_func=lambda c, prompt: _infer_batch(
            batch=[prompt], cols=[c],
            infer_func=_ask,
            schema=schema)[0][c],
        **cache_kwargs)

    # Then retrieve data.
    return READER_PROVIDERS["sqlite"](filepath=cache_filepath, table_name=cache_table)
|
|
96
65
|
|
|
97
66
|
|
|
98
67
|
if __name__ == '__main__':
|
|
@@ -101,7 +70,7 @@ if __name__ == '__main__':
|
|
|
101
70
|
parser.add_argument('--adapter', dest='adapter', type=str, default=None)
|
|
102
71
|
parser.add_argument('--attempts', dest='attempts', type=int, default=None)
|
|
103
72
|
parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
|
|
104
|
-
parser.add_argument('--src', dest='src', type=str, default=None)
|
|
73
|
+
parser.add_argument('--src', dest='src', type=str, nargs="?", default=None)
|
|
105
74
|
parser.add_argument('--schema', dest='schema', type=str, default=None,
|
|
106
75
|
help="Path to the JSON file that describes schema")
|
|
107
76
|
parser.add_argument('--to', dest='to', type=str, default=None, choices=["csv", "sqlite"])
|
|
@@ -111,31 +80,48 @@ if __name__ == '__main__':
|
|
|
111
80
|
parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
|
|
112
81
|
help="Optional trimming prompt by the specified amount of characters.")
|
|
113
82
|
|
|
114
|
-
|
|
115
|
-
|
|
83
|
+
# Extract native arguments.
|
|
84
|
+
native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
|
|
116
85
|
args = parser.parse_args(args=native_args[1:])
|
|
117
86
|
|
|
118
|
-
#
|
|
87
|
+
# Extract csv-related arguments.
|
|
88
|
+
csv_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%csv", end_prefix="%%")
|
|
89
|
+
csv_args_dict = CmdArgsService.args_to_dict(csv_args)
|
|
90
|
+
|
|
91
|
+
# Extract model-related arguments and Initialize Large Language Model.
|
|
92
|
+
model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
|
|
119
93
|
model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
|
|
120
|
-
llm, llm_model_name = init_llm(**model_args_dict)
|
|
94
|
+
llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
|
|
121
95
|
|
|
122
96
|
# Setup schema.
|
|
123
|
-
schema =
|
|
97
|
+
schema = SchemaService(json_data=JsonService.read(args.schema))
|
|
98
|
+
schema_name = schema.src.get("name", None)
|
|
124
99
|
if schema is not None:
|
|
125
|
-
logger.info(f"Using schema: {
|
|
100
|
+
logger.info(f"Using schema: {schema_name}")
|
|
126
101
|
|
|
127
102
|
input_providers = {
|
|
128
|
-
None: lambda _: chat_with_lm(llm, chain=schema.chain, model_name=llm_model_name),
|
|
129
103
|
"csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
|
|
130
104
|
as_dict=True, skip_header=True,
|
|
131
|
-
delimiter=
|
|
132
|
-
escapechar=
|
|
105
|
+
delimiter=csv_args_dict.get("delimiter", ","),
|
|
106
|
+
escapechar=csv_args_dict.get("escapechar", None)),
|
|
107
|
+
"tsv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
|
|
108
|
+
as_dict=True, skip_header=True,
|
|
109
|
+
delimiter=csv_args_dict.get("delimiter", "\t"),
|
|
110
|
+
escapechar=csv_args_dict.get("escapechar", None)),
|
|
133
111
|
"jsonl": lambda filepath: JsonlService.read(src=filepath, row_id_key=args.id_col)
|
|
134
112
|
}
|
|
135
113
|
|
|
136
114
|
output_providers = {
|
|
137
|
-
"csv": lambda filepath, data_it, header:
|
|
138
|
-
|
|
115
|
+
"csv": lambda filepath, data_it, header: CsvService.write(target=filepath,
|
|
116
|
+
data_it=data_it, header=header,
|
|
117
|
+
delimiter=csv_args_dict.get("delimiter", ","),
|
|
118
|
+
escapechar=csv_args_dict.get("escapechar", None),
|
|
119
|
+
it_type=None),
|
|
120
|
+
"tsv": lambda filepath, data_it, header: CsvService.write(target=filepath,
|
|
121
|
+
data_it=data_it, header=header,
|
|
122
|
+
delimiter=csv_args_dict.get("delimiter", "\t"),
|
|
123
|
+
escapechar=csv_args_dict.get("escapechar", None),
|
|
124
|
+
it_type=None),
|
|
139
125
|
"jsonl": lambda filepath, data_it, header:
|
|
140
126
|
JsonlService.write(target=filepath,
|
|
141
127
|
data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
|
|
@@ -145,32 +131,32 @@ if __name__ == '__main__':
|
|
|
145
131
|
args.output = args.output.format(model=llm.name()) if args.output is not None else args.output
|
|
146
132
|
tgt_filepath, tgt_ext, tgt_meta = parse_filepath(args.output, default_ext=args.to)
|
|
147
133
|
|
|
148
|
-
#
|
|
134
|
+
# We do not support multiple files for other modes.
|
|
149
135
|
src_filepath, src_ext, src_meta = parse_filepath(args.src)
|
|
150
136
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
exit(0)
|
|
137
|
+
def default_output_file_template(ext):
|
|
138
|
+
# This is a default template for output files to be generated.
|
|
139
|
+
return "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema_name]), ext])
|
|
155
140
|
|
|
156
141
|
# Setup cache target as well as the related table.
|
|
157
|
-
|
|
158
|
-
if tgt_filepath is None else tgt_filepath
|
|
142
|
+
cache_filepath = default_output_file_template(".sqlite") if tgt_filepath is None else tgt_filepath
|
|
159
143
|
cache_table = handle_table_name(tgt_meta if tgt_meta is not None else "contents")
|
|
160
144
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
145
|
+
# This is the content that we extracted via the input provider.
|
|
146
|
+
it_data = input_providers[src_ext](src_filepath)
|
|
147
|
+
|
|
148
|
+
data_it = iter_content_cached(input_dicts_it=optional_limit_iter(it_data=it_data, limit=args.limit),
|
|
149
|
+
limit_prompt=args.limit_prompt,
|
|
150
|
+
schema=schema,
|
|
151
|
+
llm=llm,
|
|
152
|
+
id_column_name=args.id_col,
|
|
153
|
+
cache_target=":".join([cache_filepath, cache_table]))
|
|
167
154
|
|
|
168
155
|
# Setup output target
|
|
169
156
|
tgt_ext = src_ext if tgt_ext is None else tgt_ext
|
|
170
|
-
output_target =
|
|
171
|
-
if tgt_filepath is None else tgt_filepath
|
|
157
|
+
output_target = default_output_file_template(f".{tgt_ext}") if tgt_filepath is None else tgt_filepath
|
|
172
158
|
|
|
173
159
|
# Perform output writing process.
|
|
174
160
|
output_providers[tgt_ext](filepath=output_target,
|
|
175
161
|
data_it=data_it,
|
|
176
|
-
header=SQLite3Service.read_columns(target=
|
|
162
|
+
header=SQLite3Service.read_columns(target=cache_filepath, table=cache_table))
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: bulk_chain
|
|
3
|
+
Version: 0.25.1
|
|
4
|
+
Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
|
|
5
|
+
Home-page: https://github.com/nicolay-r/bulk-chain
|
|
6
|
+
Author: Nicolay Rusnachenko
|
|
7
|
+
Author-email: rusnicolay@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Keywords: natural language processing,chain-of-thought,reasoning
|
|
10
|
+
Classifier: Programming Language :: Python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Requires-Python: >=3.6
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Requires-Dist: source-iter ==0.24.3
|
|
20
|
+
|
|
21
|
+
# bulk-chain 0.25.1
|
|
22
|
+

|
|
23
|
+
[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
|
|
24
|
+
[](https://x.com/nicolayr_/status/1847969224636961033)
|
|
25
|
+
[](https://pypistats.org/packages/bulk-chain)
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<img src="logo.png"/>
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
|
|
36
|
+
|
|
37
|
+
### Main Features
|
|
38
|
+
* ✅ **No-strings**: no LLM dependencies are imposed on you, and `venv` customization stays flexible.
|
|
39
|
+
* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
|
|
40
|
+
* ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
|
|
41
|
+
|
|
42
|
+
### Extra Features
|
|
43
|
+
* ✅ **Progress caching [for remote LLMs]**: withstands exceptions during LLM calls by using the `sqlite3` engine to cache LLM answers.
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Installation
|
|
47
|
+
|
|
48
|
+
From PyPI:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install --no-deps bulk-chain
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
or latest version from here:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install git+https://github.com/nicolay-r/bulk-chain@master
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Chain-of-Thought Schema
|
|
61
|
+
|
|
62
|
+
To declare a Chain-of-Thought (CoT) schema, this project uses the `JSON` format.
|
|
63
|
+
This format uses a `name` field to declare the schema name and a `schema` field that holds a list of CoT instructions for the Large Language Model.
|
|
64
|
+
|
|
65
|
+
Each step is a dictionary with `prompt` and `out` keys that correspond to the input prompt and the output variable name, respectively.
|
|
66
|
+
All the variable names are expected to be mentioned in `{}`.
|
|
67
|
+
|
|
68
|
+
Below is an example of how to declare your own schema:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
{
|
|
72
|
+
"name": "schema-name",
|
|
73
|
+
"schema": [
|
|
74
|
+
{"prompt": "Given the question '{text}', let's think step-by-step.",
|
|
75
|
+
"out": "steps"},
|
|
76
|
+
{"prompt": "For the question '{text}' the reasoning steps are '{steps}'. What would be an answer?",
|
|
77
|
+
"out": "answer"},
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
# Usage
|
|
83
|
+
|
|
84
|
+
Preliminary steps:
|
|
85
|
+
|
|
86
|
+
1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
|
|
87
|
+
2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
|
|
88
|
+
|
|
89
|
+
## API
|
|
90
|
+
|
|
91
|
+
Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
|
|
92
|
+
|
|
93
|
+
## Shell
|
|
94
|
+
|
|
95
|
+
> **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
|
|
96
|
+
|
|
97
|
+
1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
|
|
98
|
+
```bash
|
|
99
|
+
wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
|
|
100
|
+
```
|
|
101
|
+
2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
|
|
102
|
+
3. 🚀 Launch inference using `DeepSeek-R1`:
|
|
103
|
+
```bash
|
|
104
|
+
python3 -m bulk_chain.infer \
|
|
105
|
+
--src "<PATH-TO-YOUR-CSV-or-JSONL>" \
|
|
106
|
+
--schema "test/schema/default.json" \
|
|
107
|
+
--adapter "replicate_104.py:Replicate" \
|
|
108
|
+
%%m \
|
|
109
|
+
--model_name "deepseek-ai/deepseek-r1" \
|
|
110
|
+
--api_token "<REPLICATE-API-TOKEN>"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Or, you can launch **demo mode** to interact with LLM via command line:
|
|
114
|
+
> **NOTE:** Demo supports streaming!
|
|
115
|
+
```bash
|
|
116
|
+
python3 -m bulk_chain.demo \
|
|
117
|
+
--schema "test/schema/thor_cot_schema.json" \
|
|
118
|
+
--adapter "dynamic:replicate_104.py:Replicate" \
|
|
119
|
+
%%m \
|
|
120
|
+
--model_name "meta/meta-llama-3-70b-instruct" \
|
|
121
|
+
--api_token "<REPLICATE-API-TOKEN>" \
|
|
122
|
+
--stream
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
# Embed your LLM
|
|
126
|
+
|
|
127
|
+
All you have to do is implement the `BaseLM` class, which includes:
|
|
128
|
+
* `__init__` -- for setting up *batching mode support* and (optional) *model name*;
|
|
129
|
+
* `ask(prompt)` -- infer your model with the given `prompt`.
|
|
130
|
+
|
|
131
|
+
See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
bulk_chain/api.py,sha256=3q1t4A5wop_BRgYanFCCSQBiGu38P9ds0hTbuxNIUKQ,3590
|
|
3
|
+
bulk_chain/demo.py,sha256=x5OPyc7NSSr9KsfSx09KQkntJIev6gytqdhtx4b9FdU,3224
|
|
4
|
+
bulk_chain/infer.py,sha256=VsqP8CP1pHBNeaDYAv__W6EPL8qBibuAtHzl4-S54iU,8148
|
|
5
|
+
bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
bulk_chain/core/llm_base.py,sha256=uX_uibm5y8STfMKNYL64EeF8UowfJGwCD_t-uftHoJE,1849
|
|
7
|
+
bulk_chain/core/service_args.py,sha256=lq4Veuh4QNu8mlCv8MT9S1rMxTn4FKalyp-3boYonVk,2136
|
|
8
|
+
bulk_chain/core/service_batch.py,sha256=yQr6fbQd4ifQBGMhZMrQQeZpXtDchMKMGJi8XPG7thc,1430
|
|
9
|
+
bulk_chain/core/service_data.py,sha256=ZjJDtd1jrQm9hRCXMqe4CT_qF2XDbWBE1lVibP7tAWo,942
|
|
10
|
+
bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
|
|
11
|
+
bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
|
|
12
|
+
bulk_chain/core/service_llm.py,sha256=uxtEw9OANFyjS4IECos_N5pUfAPQZVul61m64gnlwt0,3880
|
|
13
|
+
bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
|
|
14
|
+
bulk_chain/core/utils.py,sha256=UV6Cefaw7yZiYblsCr-s9LsbcI83xe7eESBvha9A2Og,2784
|
|
15
|
+
bulk_chain/core/utils_logger.py,sha256=BD-ADxaeeuHztaYjqtIY_cIzc5r2Svq9XwRtrgIEqyI,1636
|
|
16
|
+
bulk_chain-0.25.1.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
|
|
17
|
+
bulk_chain-0.25.1.dist-info/METADATA,sha256=EQNb6FwovXUSNXL5QpWf-3003863kL84ZWU7wya3ZeM,5071
|
|
18
|
+
bulk_chain-0.25.1.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
|
|
19
|
+
bulk_chain-0.25.1.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
|
|
20
|
+
bulk_chain-0.25.1.dist-info/RECORD,,
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: bulk_chain
|
|
3
|
-
Version: 0.24.2
|
|
4
|
-
Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
|
|
5
|
-
Home-page: https://github.com/nicolay-r/bulk-chain
|
|
6
|
-
Author: Nicolay Rusnachenko
|
|
7
|
-
Author-email: rusnicolay@gmail.com
|
|
8
|
-
License: MIT License
|
|
9
|
-
Keywords: natural language processing,chain-of-thought,reasoning
|
|
10
|
-
Classifier: Programming Language :: Python
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
-
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
-
Requires-Python: >=3.6
|
|
16
|
-
Description-Content-Type: text/markdown
|
|
17
|
-
License-File: LICENSE
|
|
18
|
-
Requires-Dist: tqdm
|
|
19
|
-
Requires-Dist: source-iter ==0.24.2
|
|
20
|
-
|
|
21
|
-
# bulk-chain 0.24.2
|
|
22
|
-

|
|
23
|
-
[](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
|
|
24
|
-
[](https://x.com/nicolayr_/status/1847969224636961033)
|
|
25
|
-
|
|
26
|
-
<p align="center">
|
|
27
|
-
<img src="logo.png"/>
|
|
28
|
-
</p>
|
|
29
|
-
|
|
30
|
-
A lightweight, no-strings-attached **[Chain-of-Thought](https://arxiv.org/abs/2201.11903) framework** for your LLM, ensuring reliable results for bulk input requests stored in `CSV` / `JSONL` / `sqlite`.
|
|
31
|
-
It allows applying series of prompts formed into `schema` (See [related section](#chain-of-thought-schema))
|
|
32
|
-
|
|
33
|
-
### Features
|
|
34
|
-
* ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
|
|
35
|
-
* ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
|
|
36
|
-
* ✅ **Progress caching**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
|
|
37
|
-
* ✅ **Support schemas descriptions** for Chain-of-Thought concept.
|
|
38
|
-
|
|
39
|
-
# Installation
|
|
40
|
-
|
|
41
|
-
```bash
|
|
42
|
-
pip install bulk-chain
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
## Chain-of-Thought Schema
|
|
46
|
-
|
|
47
|
-
To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
|
|
48
|
-
This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
|
|
49
|
-
|
|
50
|
-
Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
|
|
51
|
-
All the variable names are expected to be mentioned in `{}`.
|
|
52
|
-
|
|
53
|
-
Below, is an example on how to declare your own schema:
|
|
54
|
-
|
|
55
|
-
```python
|
|
56
|
-
{
|
|
57
|
-
"name": "schema-name",
|
|
58
|
-
"schema": [
|
|
59
|
-
{"prompt": "Given the question '{text}', let's think step-by-step.",
|
|
60
|
-
"out": "steps"},
|
|
61
|
-
{"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
|
|
62
|
-
"out": "answer"},
|
|
63
|
-
]
|
|
64
|
-
}
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
Another templates are available [here](/ext/schema/thor_cot_schema.json).
|
|
68
|
-
|
|
69
|
-
# Usage
|
|
70
|
-
|
|
71
|
-
Just **three** simple steps:
|
|
72
|
-
|
|
73
|
-
1. Define your [CoT Schema](#chain-of-thought-schema), or fetch it as shown below:
|
|
74
|
-
```bash
|
|
75
|
-
!wget https://raw.githubusercontent.com/nicolay-r/bulk-chain/refs/heads/master/ext/schema/default.json
|
|
76
|
-
```
|
|
77
|
-
2. Fetch or write your own **model** or pick the one [preset here](/ext/):
|
|
78
|
-
```bash
|
|
79
|
-
!wget https://raw.githubusercontent.com/nicolay-r/bulk-chain/refs/heads/master/ext/flan_t5.py
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
3. Launch inference in (chat mode):
|
|
83
|
-
```bash
|
|
84
|
-
!python -m bulk_chain.infer \
|
|
85
|
-
--schema "default.json" \
|
|
86
|
-
--adapter "dynamic:flan_t5.py:FlanT5" \
|
|
87
|
-
%% \
|
|
88
|
-
--device "cpu" \
|
|
89
|
-
--temp 0.1
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
# Embed your LLM
|
|
93
|
-
|
|
94
|
-
All you have to do is to implement `BaseLM` class, that includes:
|
|
95
|
-
* `__init__` -- for initialization;
|
|
96
|
-
* `ask(prompt)` -- infer your model with the given `prompt`.
|
|
97
|
-
|
|
98
|
-
See examples with models [here](/ext).
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
bulk_chain/infer.py,sha256=QgbR64A1JS8B9oh0_ruynEfdCpoG1rPHVMtk5Z0Ch2U,7476
|
|
3
|
-
bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
bulk_chain/core/llm_base.py,sha256=Cakuuy4jTVPOta5TyEPFTPGvFpJfM6m0dAIAYSu7zFM,1008
|
|
5
|
-
bulk_chain/core/service_args.py,sha256=Qr3rHsAB8wnajB-DbU-GjiEpRZFP4D6s1lVTpLkPPX4,1294
|
|
6
|
-
bulk_chain/core/service_data.py,sha256=18gQwSCTEsI7XFukq8AE5lDJX_QQRpasaH69g6EddV0,797
|
|
7
|
-
bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
|
|
8
|
-
bulk_chain/core/service_llm.py,sha256=NoD5KHGtXCmN8SlpgH0Z5KCmSxZcnVVfp65vhVRoG84,2742
|
|
9
|
-
bulk_chain/core/service_schema.py,sha256=JVhOv2YP5VEtiwOq_zgCzhS2uF_BOATAgg6fmKRf2NQ,1209
|
|
10
|
-
bulk_chain/core/utils.py,sha256=UV6Cefaw7yZiYblsCr-s9LsbcI83xe7eESBvha9A2Og,2784
|
|
11
|
-
bulk_chain-0.24.2.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
|
|
12
|
-
bulk_chain-0.24.2.dist-info/METADATA,sha256=yEKF0X90AvNw6yq-W7oBdTLiH8KSrCKRHaERhOFLXFA,3685
|
|
13
|
-
bulk_chain-0.24.2.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
|
|
14
|
-
bulk_chain-0.24.2.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
|
|
15
|
-
bulk_chain-0.24.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|