bulk-chain 0.24.2__tar.gz → 0.25.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. bulk_chain-0.25.1/PKG-INFO +131 -0
  2. bulk_chain-0.25.1/README.md +111 -0
  3. bulk_chain-0.25.1/bulk_chain/api.py +99 -0
  4. bulk_chain-0.25.1/bulk_chain/core/llm_base.py +52 -0
  5. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/service_args.py +31 -7
  6. bulk_chain-0.25.1/bulk_chain/core/service_batch.py +51 -0
  7. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/service_data.py +4 -0
  8. bulk_chain-0.25.1/bulk_chain/core/service_dict.py +10 -0
  9. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/service_llm.py +36 -15
  10. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/service_schema.py +1 -2
  11. bulk_chain-0.25.1/bulk_chain/core/utils_logger.py +41 -0
  12. bulk_chain-0.25.1/bulk_chain/demo.py +85 -0
  13. bulk_chain-0.25.1/bulk_chain/infer.py +162 -0
  14. bulk_chain-0.25.1/bulk_chain.egg-info/PKG-INFO +131 -0
  15. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain.egg-info/SOURCES.txt +9 -1
  16. bulk_chain-0.25.1/bulk_chain.egg-info/requires.txt +2 -0
  17. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/setup.py +1 -1
  18. bulk_chain-0.25.1/test/test.py +62 -0
  19. bulk_chain-0.25.1/test/test_api.py +43 -0
  20. bulk_chain-0.25.1/test/test_cmdargs.py +29 -0
  21. bulk_chain-0.25.1/test/test_provider_batching.py +31 -0
  22. bulk_chain-0.24.2/PKG-INFO +0 -98
  23. bulk_chain-0.24.2/README.md +0 -78
  24. bulk_chain-0.24.2/bulk_chain/core/llm_base.py +0 -35
  25. bulk_chain-0.24.2/bulk_chain/infer.py +0 -176
  26. bulk_chain-0.24.2/bulk_chain.egg-info/PKG-INFO +0 -98
  27. bulk_chain-0.24.2/bulk_chain.egg-info/requires.txt +0 -2
  28. bulk_chain-0.24.2/test/test_cmdargs.py +0 -9
  29. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/LICENSE +0 -0
  30. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/__init__.py +0 -0
  31. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/__init__.py +0 -0
  32. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/service_json.py +0 -0
  33. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain/core/utils.py +0 -0
  34. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain.egg-info/dependency_links.txt +0 -0
  35. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/bulk_chain.egg-info/top_level.txt +0 -0
  36. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/setup.cfg +0 -0
  37. {bulk_chain-0.24.2 → bulk_chain-0.25.1}/test/test_args_seeking.py +0 -0
@@ -0,0 +1,131 @@
+ Metadata-Version: 2.1
+ Name: bulk_chain
+ Version: 0.25.1
+ Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
+ Home-page: https://github.com/nicolay-r/bulk-chain
+ Author: Nicolay Rusnachenko
+ Author-email: rusnicolay@gmail.com
+ License: MIT License
+ Keywords: natural language processing,chain-of-thought,reasoning
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: tqdm
+ Requires-Dist: source-iter==0.24.3
+
+ # bulk-chain 0.25.1
+ ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
+ [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+ [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
+ [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
+
+ <p align="center">
+     <img src="logo.png"/>
+ </p>
+
+ <p align="center">
+     <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+ </p>
+
+ A no-strings-attached **framework** for your LLM that applies Chain-of-Thought-style [prompt `schemas`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
+
+ ### Main Features
+ * ✅ **No strings attached**: free of hard LLM dependencies, with flexible `venv` customization.
+ * ✅ **Schema descriptions** supported for the Chain-of-Thought concept.
+ * ✅ **Iterator over an unbounded number of input contexts**, served in `CSV`/`JSONL`.
+
+ ### Extra Features
+ * ✅ **Progress caching [for remote LLMs]**: withstands exceptions during LLM calls by caching LLM answers with the `sqlite3` engine.
+
+
+ # Installation
+
+ From PyPI:
+
+ ```bash
+ pip install --no-deps bulk-chain
+ ```
+
+ or the latest version from this repository:
+
+ ```bash
+ pip install git+https://github.com/nicolay-r/bulk-chain@master
+ ```
+
+ ## Chain-of-Thought Schema
+
+ To declare a Chain-of-Thought (CoT) schema, this project uses the `JSON` format.
+ The `name` field declares the schema name, while `schema` lists the CoT instructions for the Large Language Model.
+
+ Each step is a dictionary with `prompt` and `out` keys, which correspond to the input prompt and the output variable name, respectively.
+ All variable names are expected to be wrapped in `{}`.
+
+ Below is an example of how to declare your own schema:
+
+ ```python
+ {
+ "name": "schema-name",
+ "schema": [
+     {"prompt": "Given the question '{text}', let's think step-by-step.",
+      "out": "steps"},
+     {"prompt": "For the question '{text}' the reasoning steps are '{steps}'. What would be the answer?",
+      "out": "answer"},
+ ]
+ }
+ ```
+
+ # Usage
+
+ Preliminary steps:
+
+ 1. Define your [schema](#chain-of-thought-schema) ([example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
+ 2. Wrap or pick an **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+
+ ## API
+
+ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki).
+
+ ## Shell
+
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
+
+ 1. ⬇️ Download the [replicate](https://replicate.com/) provider for `bulk-chain`:
+ ```bash
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
+ ```
+ 2. 📜 Set up your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
+ ```bash
+ python3 -m bulk_chain.infer \
+     --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
+     --schema "test/schema/default.json" \
+     --adapter "replicate_104.py:Replicate" \
+     %%m \
+     --model_name "deepseek-ai/deepseek-r1" \
+     --api_token "<REPLICATE-API-TOKEN>"
+ ```
+
+ Or, you can launch **demo mode** to interact with the LLM via the command line:
+ > **NOTE:** The demo supports streaming!
+ ```bash
+ python3 -m bulk_chain.demo \
+     --schema "test/schema/thor_cot_schema.json" \
+     --adapter "dynamic:replicate_104.py:Replicate" \
+     %%m \
+     --model_name "meta/meta-llama-3-70b-instruct" \
+     --api_token "<REPLICATE-API-TOKEN>" \
+     --stream
+ ```
+
+ # Embed your LLM
+
+ All you have to do is implement the `BaseLM` class, which includes:
+ * `__init__` -- sets up *batching mode support* and an (optional) *model name*;
+ * `ask(prompt)` -- infers your model with the given `prompt`.
+
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -0,0 +1,111 @@
+ # bulk-chain 0.25.1
+ ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
+ [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+ [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
+ [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
+
+ <p align="center">
+     <img src="logo.png"/>
+ </p>
+
+ <p align="center">
+     <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+ </p>
+
+ A no-strings-attached **framework** for your LLM that applies Chain-of-Thought-style [prompt `schemas`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
+
+ ### Main Features
+ * ✅ **No strings attached**: free of hard LLM dependencies, with flexible `venv` customization.
+ * ✅ **Schema descriptions** supported for the Chain-of-Thought concept.
+ * ✅ **Iterator over an unbounded number of input contexts**, served in `CSV`/`JSONL`.
+
+ ### Extra Features
+ * ✅ **Progress caching [for remote LLMs]**: withstands exceptions during LLM calls by caching LLM answers with the `sqlite3` engine.
+
+
+ # Installation
+
+ From PyPI:
+
+ ```bash
+ pip install --no-deps bulk-chain
+ ```
+
+ or the latest version from this repository:
+
+ ```bash
+ pip install git+https://github.com/nicolay-r/bulk-chain@master
+ ```
+
+ ## Chain-of-Thought Schema
+
+ To declare a Chain-of-Thought (CoT) schema, this project uses the `JSON` format.
+ The `name` field declares the schema name, while `schema` lists the CoT instructions for the Large Language Model.
+
+ Each step is a dictionary with `prompt` and `out` keys, which correspond to the input prompt and the output variable name, respectively.
+ All variable names are expected to be wrapped in `{}`.
+
+ Below is an example of how to declare your own schema:
+
+ ```python
+ {
+ "name": "schema-name",
+ "schema": [
+     {"prompt": "Given the question '{text}', let's think step-by-step.",
+      "out": "steps"},
+     {"prompt": "For the question '{text}' the reasoning steps are '{steps}'. What would be the answer?",
+      "out": "answer"},
+ ]
+ }
+ ```
+
+ # Usage
+
+ Preliminary steps:
+
+ 1. Define your [schema](#chain-of-thought-schema) ([example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
+ 2. Wrap or pick an **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+
+ ## API
+
+ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki).
+
+ ## Shell
+
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
+
+ 1. ⬇️ Download the [replicate](https://replicate.com/) provider for `bulk-chain`:
+ ```bash
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
+ ```
+ 2. 📜 Set up your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
+ ```bash
+ python3 -m bulk_chain.infer \
+     --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
+     --schema "test/schema/default.json" \
+     --adapter "replicate_104.py:Replicate" \
+     %%m \
+     --model_name "deepseek-ai/deepseek-r1" \
+     --api_token "<REPLICATE-API-TOKEN>"
+ ```
+
+ Or, you can launch **demo mode** to interact with the LLM via the command line:
+ > **NOTE:** The demo supports streaming!
+ ```bash
+ python3 -m bulk_chain.demo \
+     --schema "test/schema/thor_cot_schema.json" \
+     --adapter "dynamic:replicate_104.py:Replicate" \
+     %%m \
+     --model_name "meta/meta-llama-3-70b-instruct" \
+     --api_token "<REPLICATE-API-TOKEN>" \
+     --stream
+ ```
+
+ # Embed your LLM
+
+ All you have to do is implement the `BaseLM` class, which includes:
+ * `__init__` -- sets up *batching mode support* and an (optional) *model name*;
+ * `ask(prompt)` -- infers your model with the given `prompt`.
+
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -0,0 +1,99 @@
+ import os
+ from itertools import chain
+
+ from bulk_chain.core.llm_base import BaseLM
+ from bulk_chain.core.service_batch import BatchIterator, BatchService
+ from bulk_chain.core.service_data import DataService
+ from bulk_chain.core.service_dict import DictionaryService
+ from bulk_chain.core.service_json import JsonService
+ from bulk_chain.core.service_schema import SchemaService
+ from bulk_chain.core.utils import dynamic_init, find_by_prefix
+
+ INFER_MODES = {
+     "default": lambda llm, prompt, limit_prompt=None: llm.ask_core(
+         prompt[:limit_prompt] if limit_prompt is not None else prompt),
+     "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
+         DataService.limit_prompts(batch, limit=limit_prompt))
+ }
+
+
+ CWD = os.getcwd()
+
+
+ def _update_batch_content(c, batch, schema, infer_func):
+     assert (isinstance(batch, list))
+     assert (isinstance(c, str))
+
+     if c in schema.p2r:
+         for batch_item in batch:
+             batch_item[c] = DataService.get_prompt_text(prompt=batch_item[c]["prompt"], data_dict=batch_item)
+     if c in schema.r2p:
+         p_column = schema.r2p[c]
+         # This instruction takes a lot of time in non-batching mode.
+         BatchService.handle_param_as_batch(batch=batch,
+                                            src_param=p_column,
+                                            tgt_param=c,
+                                            handle_func=lambda b: infer_func(b))
+
+
+ def _infer_batch(batch, schema, infer_func, cols=None):
+     assert (isinstance(batch, list))
+     assert (callable(infer_func))
+
+     if len(batch) == 0:
+         return batch
+
+     if cols is None:
+         first_item = batch[0]
+         cols = first_item.keys()
+
+     for c in cols:
+         _update_batch_content(c=c, batch=batch, schema=schema, infer_func=infer_func)
+
+     return batch
+
+
+ def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, limit_prompt=None):
+     """ This method represents the Python API aimed at applying `llm` to an
+         iterator of input_dicts using the given `schema`.
+     """
+     assert (isinstance(llm, BaseLM))
+
+     # Quick initialization of the schema.
+     if isinstance(schema, str):
+         schema = JsonService.read(schema)
+     if isinstance(schema, dict):
+         schema = SchemaService(json_data=schema)
+
+     prompts_it = map(
+         lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
+         input_dicts_it
+     )
+
+     content_it = (_infer_batch(batch=batch,
+                                infer_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
+                                schema=schema)
+                   for batch in BatchIterator(prompts_it, batch_size=batch_size))
+
+     yield from content_it if return_batch else chain.from_iterable(content_it)
+
+
+ def init_llm(adapter, **model_kwargs):
+     """ This method performs dynamic initialization of an LLM from a third-party resource.
+     """
+
+     # List of the supported models and their API wrappers.
+     models_preset = {
+         "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
+                                         class_name=llm_model_params)(**model_kwargs)
+     }
+
+     # Initialize the LLM model.
+     params = adapter.split(':')
+     llm_model_type = params[0]
+     llm_model_name = params[1] if len(params) > 1 else params[-1]
+     llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
+     llm = find_by_prefix(d=models_preset, key=llm_model_type)()
+
+     return llm, llm_model_name
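For orientation, a minimal sketch of how these two entry points compose; the schema is the one from the README above, while the adapter path, model name, and token are placeholders:

```python
from bulk_chain.api import init_llm, iter_content

# Adapter syntax is "dynamic:<path-to-provider-file>:<class-name>",
# matching the parsing in init_llm above; remaining kwargs go to the provider.
llm, model_name = init_llm("dynamic:replicate_104.py:Replicate",
                           model_name="deepseek-ai/deepseek-r1",
                           api_token="<REPLICATE-API-TOKEN>")

schema = {
    "schema": [
        {"prompt": "Given the question '{text}', let's think step-by-step.", "out": "steps"},
        {"prompt": "For the question '{text}' the reasoning steps are '{steps}'. What would be the answer?", "out": "answer"},
    ]
}

# iter_content is a generator: each yielded batch is a list of input dicts,
# enriched in place with the `out` variables ("steps", "answer") of the schema.
for batch in iter_content(iter([{"text": "What is 2 + 2?"}]), llm=llm, schema=schema, batch_size=1):
    for row in batch:
        print(row["answer"])
```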
@@ -0,0 +1,52 @@
+ import logging
+ import time
+
+ from bulk_chain.core.utils import format_model_name
+
+
+ class BaseLM(object):
+
+     def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
+                  support_batching=False, **kwargs):
+
+         self.__name = name
+         self.__attempts = 1 if attempts is None else attempts
+         self.__delay_sec = delay_sec
+         self.__support_batching = support_batching
+         self.__logger = None
+
+         if enable_log:
+             self.__logger = logging.getLogger(__name__)
+             logging.basicConfig(level=logging.INFO)
+
+     def ask_core(self, batch):
+
+         for i in range(self.__attempts):
+             try:
+                 if self.__support_batching:
+                     # Launch in batch mode.
+                     response = self.ask(batch)
+                 else:
+                     # Launch in non-batch mode.
+                     assert len(batch) == 1, "The LM does not support batching," \
+                         f" while the size of the content is {len(batch)}, which is not equal to 1. " \
+                         f"Please enable batching support or adjust the inference settings."
+                     response = self.ask(batch[0])
+
+                 # Wrap the response into a batch in the case of non-batching mode.
+                 return response if self.__support_batching else [response]
+
+             except Exception as e:
+                 if self.__logger is not None:
+                     self.__logger.info("Unable to infer the result. Attempt {} out of {}.".format(i + 1, self.__attempts))
+                     self.__logger.info(e)
+                 time.sleep(self.__delay_sec)
+
+         raise Exception("Can't infer")
+
+     def ask(self, content):
+         raise NotImplementedError()
+
+     def name(self):
+         return format_model_name(self.__name)
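As a reference for the `__init__`/`ask` contract above, a toy subclass; the echo behavior is a stand-in for a real provider call:

```python
from bulk_chain.core.llm_base import BaseLM


class EchoLM(BaseLM):

    def __init__(self, **kwargs):
        # support_batching=True means ask() receives the whole batch (a list of prompts).
        super().__init__(name="echo", support_batching=True, **kwargs)

    def ask(self, batch):
        # A real provider would issue the API call here.
        return [f"echo: {prompt}" for prompt in batch]


llm = EchoLM()
print(llm.ask_core(["hello"]))  # ['echo: hello']
```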
@@ -12,6 +12,11 @@ class CmdArgsService:
      def iter_arguments(lst):
 
          def __release():
+
+             # We use the True value by default to treat the related parameter as a flag.
+             if len(buf) == 0:
+                 buf.append(True)
+
              return key, buf if len(buf) > 1 else buf[0]
 
          key = None
@@ -29,18 +34,37 @@ class CmdArgsService:
              buf.append(a)
 
          # Sharing the remaining params.
-         if len(buf) > 0:
+         if key is not None:
              yield __release()
 
      @staticmethod
-     def partition_list(lst, sep):
+     def __find_suffix_ind(lst, idx_from, end_prefix):
+         for i in range(idx_from, len(lst)):
+             if lst[i].startswith(end_prefix):
+                 return i
+         return len(lst)
+
+     @staticmethod
+     def extract_native_args(lst, end_prefix):
+         return lst[:CmdArgsService.__find_suffix_ind(lst, idx_from=0, end_prefix=end_prefix)]
+
+     @staticmethod
+     def find_grouped_args(lst, starts_with, end_prefix):
          """Slices a list in two, cutting on index matching "sep"
          """
-         if sep in lst:
-             idx = lst.index(sep)
-             return (lst[:idx], lst[idx+1:])
-         else:
-             return (lst[:], None)
+
+         # Checking the presence of starts_with.
+         # We have to return empty content in the case starts_with is absent from lst.
+         if starts_with not in lst:
+             return []
+
+         # Assigning the start index.
+         idx_from = lst.index(starts_with) + 1
+
+         # Assigning the end index.
+         idx_to = CmdArgsService.__find_suffix_ind(lst, idx_from=idx_from, end_prefix=end_prefix)
+
+         return lst[idx_from:idx_to]
 
      @staticmethod
      def args_to_dict(args):
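To make the `%%m` separator from the shell examples concrete, a sketch of how these helpers slice `argv`; the `%%` end prefix is an assumption about how `infer.py` invokes them, since that call site is not shown in this hunk:

```python
from bulk_chain.core.service_args import CmdArgsService

argv = ["--schema", "schema.json", "--adapter", "provider.py:Replicate",
        "%%m", "--model_name", "deepseek-ai/deepseek-r1", "--stream"]

# Arguments before the first "%%"-prefixed token belong to bulk_chain itself.
print(CmdArgsService.extract_native_args(argv, end_prefix="%%"))
# ['--schema', 'schema.json', '--adapter', 'provider.py:Replicate']

# The group after "%%m" (up to the next "%%" token, or the end) goes to the model.
print(CmdArgsService.find_grouped_args(argv, starts_with="%%m", end_prefix="%%"))
# ['--model_name', 'deepseek-ai/deepseek-r1', '--stream']
```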
@@ -0,0 +1,51 @@
+ class BatchService(object):
+
+     @staticmethod
+     def handle_param_as_batch(batch, src_param, tgt_param, handle_func):
+         assert (isinstance(batch, list))
+         assert (isinstance(src_param, str))
+         assert (callable(handle_func))
+
+         _batch = [item[src_param] for item in batch]
+
+         # Do the handling for the batch.
+         _handled_batch = handle_func(_batch)
+         assert (isinstance(_handled_batch, list))
+
+         # Apply the changes.
+         for i, item in enumerate(batch):
+             item[tgt_param] = _handled_batch[i]
+
+
+ class BatchIterator:
+
+     def __init__(self, data_iter, batch_size, end_value=None):
+         assert (isinstance(batch_size, int) and batch_size > 0)
+         assert (callable(end_value) or end_value is None)
+         self.__data_iter = data_iter
+         self.__index = 0
+         self.__batch_size = batch_size
+         self.__end_value = end_value
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         buffer = []
+         while True:
+             try:
+                 data = next(self.__data_iter)
+             except StopIteration:
+                 break
+             buffer.append(data)
+             if len(buffer) == self.__batch_size:
+                 break
+
+         if len(buffer) > 0:
+             self.__index += 1
+             return buffer
+
+         if self.__end_value is None:
+             raise StopIteration
+         else:
+             return self.__end_value()
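The iterator's behavior in brief, following directly from the code above: full batches until the source runs dry, then a shorter tail batch.

```python
from bulk_chain.core.service_batch import BatchIterator

batches = list(BatchIterator(iter(range(5)), batch_size=2))
print(batches)  # [[0, 1], [2, 3], [4]]
```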
@@ -20,3 +20,7 @@ class DataService(object):
          field_names = list(parse_fields_func(prompt))
          return DataService.compose_prompt_text(
              prompt=prompt, data_dict=data_dict, field_names=field_names)
+
+     @staticmethod
+     def limit_prompts(prompts_list, limit=None):
+         return [p[:limit] if limit is not None else p for p in prompts_list]
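Truncation is applied per prompt and is a no-op when `limit` is `None`:

```python
from bulk_chain.core.service_data import DataService

print(DataService.limit_prompts(["abcdef", "xy"], limit=3))  # ['abc', 'xy']
print(DataService.limit_prompts(["abcdef"]))                 # ['abcdef']
```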
@@ -0,0 +1,10 @@
+ class DictionaryService:
+
+     @staticmethod
+     def custom_update(src_dict, other_dict):
+         for k, v in other_dict.items():
+             if k in src_dict:
+                 raise Exception(f"The key `{k}` is already defined in both dicts with values: "
+                                 f"`{src_dict[k]}` (src) and `{v}` (other)")
+             src_dict[k] = v
+         return src_dict
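Unlike `dict.update`, this merge refuses to overwrite existing keys, which is how input rows are guarded against clashing with schema-defined output names:

```python
from bulk_chain.core.service_dict import DictionaryService

row = {"text": "What is 2 + 2?"}
DictionaryService.custom_update(src_dict=row, other_dict={"steps": None})
print(row)  # {'text': 'What is 2 + 2?', 'steps': None}

# Raises: the key `text` is already defined in both dicts.
DictionaryService.custom_update(src_dict=row, other_dict={"text": "clash"})
```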
@@ -1,8 +1,7 @@
- import logging
-
  from bulk_chain.core.llm_base import BaseLM
  from bulk_chain.core.service_data import DataService
  from bulk_chain.core.utils import iter_params
+ from bulk_chain.core.utils_logger import StreamedLogger
 
 
  def pad_str(text, pad):
@@ -23,29 +22,32 @@ def nice_output(text, width, pad=4, remove_new_line=False):
      return text_wrap(content=short_text, width=width, handle_line=lambda line: pad_str(line, pad=pad))
 
 
- def chat_with_lm(lm, chain=None, model_name=None):
+ def chat_with_lm(lm, preset_dict=None, chain=None, model_name=None, line_width=80, pad=0):
      assert (isinstance(lm, BaseLM))
      assert (isinstance(chain, list))
      assert (isinstance(model_name, str) or model_name is None)
 
-     logger = logging.getLogger(__name__)
-     logging.basicConfig(level=logging.INFO)
+     preset_dict = {} if preset_dict is None else preset_dict
+
+     streamed_logger = StreamedLogger(__name__)
 
      do_exit = False
      model_name = model_name if model_name is not None else "agent"
 
      while not do_exit:
 
-         logger.info("----------------")
+         streamed_logger.info("----------------")
+         streamed_logger.info("\n")
 
          # Launching the CoT engine loop.
-         data_dict = {}
-         for prompt_args in chain:
+         data_dict = {} | preset_dict
+         for chain_ind, prompt_args in enumerate(chain):
 
              # Processing the prompt.
              prompt = prompt_args["prompt"]
 
              # Filling necessary parameters.
+             user_informed = False
              field_names = list(iter_params(prompt))
              for ind, f_name in enumerate(field_names):
@@ -54,6 +56,7 @@ def chat_with_lm(lm, chain=None, model_name=None):
                  user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
                                     f"(or 'exit' to quit): ")
+                 user_informed = True
 
                  if user_input.lower() == 'exit':
                      do_exit = True
@@ -64,19 +67,37 @@ def chat_with_lm(lm, chain=None, model_name=None):
              if do_exit:
                  break
 
+             # In the case of the initial interaction with the chain,
+             # we make sure the user is aware before the interaction starts.
+             if chain_ind == 0 and not user_informed:
+                 user_input = input("Enter to continue (or 'exit' to quit) ...")
+                 if user_input.lower() == 'exit':
+                     do_exit = True
+
              # Finally asking LLM.
              DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
              actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
 
              # Returning meta information, passed to LLM.
-             pad = 4
-             logger.info(pad_str(f"{model_name} (ask) ->", pad=pad))
-             logger.info(nice_output(actual_prompt, pad=pad*2, remove_new_line=True, width=80))
+             streamed_logger.info(pad_str(f"{model_name} (ask [{chain_ind+1}/{len(chain)}]) ->", pad=pad))
+             streamed_logger.info("\n")
+             streamed_logger.info(nice_output(actual_prompt, pad=pad, remove_new_line=True, width=line_width))
+             streamed_logger.info("\n\n")
 
              # Response.
-             response = lm.ask_safe(actual_prompt)
-             logger.info(pad_str(f"{model_name} (resp)->", pad=pad))
-             logger.info(nice_output(response, pad=pad*2, remove_new_line=False, width=80))
+             response = lm.ask_core(batch=[actual_prompt])[0]
+             streamed_logger.info(pad_str(f"{model_name} (resp [{chain_ind+1}/{len(chain)}])->", pad=pad))
+             streamed_logger.info("\n")
+             if isinstance(response, str):
+                 streamed_logger.info(nice_output(response, pad=pad, remove_new_line=False, width=line_width))
+                 buffer = [response]
+             else:
+                 # A streamed response yields chunks; print them as they arrive.
+                 buffer = []
+                 for chunk in response:
+                     streamed_logger.info(chunk)
+                     buffer.append(str(chunk))
+
+             streamed_logger.info("\n\n")
 
              # Collecting the answer for the next turn.
-             data_dict[prompt_args["out"]] = response
+             data_dict[prompt_args["out"]] = "".join(buffer)
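The `isinstance(response, str)` branch above fixes the streaming contract: a provider's `ask` may return either a complete string or an iterable of chunks. A sketch of a streaming-capable provider under that contract (the token list is illustrative):

```python
from bulk_chain.core.llm_base import BaseLM


class FakeStreamLM(BaseLM):

    def ask(self, prompt):
        # A real provider would yield tokens from the API stream here;
        # chat_with_lm prints each chunk as it arrives and joins them afterwards.
        for token in ["The", " answer", " is", " 4."]:
            yield token
```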
@@ -2,12 +2,11 @@ class SchemaService(object):
 
      def __init__(self, json_data):
          self.src = json_data
-         self.name = self.src["name"]
          self.r2p, self.p2r, self.cot_args, self.chain = SchemaService.__init_schema(prompts=json_data["schema"])
 
      @classmethod
      def from_prompt(cls, prompt):
-         prompt_schema = {"name": "prompt", "schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
+         prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
          return cls(prompt_schema)
 
      @staticmethod
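With the `name` field no longer required, a single-step schema can be built straight from a prompt string; its answer is collected under the `response` variable:

```python
from bulk_chain.core.service_schema import SchemaService

schema = SchemaService.from_prompt("Summarize the following text: {text}")
print(schema.src)
# {'schema': [{'prompt': 'Summarize the following text: {text}', 'out': 'response', 'in': 'prompt'}]}
```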
@@ -0,0 +1,41 @@
+ import logging
+
+
+ def StreamedLogger(name: str) -> logging.Logger:
+     """ https://medium.com/@r.das699/optimizing-logging-practices-for-streaming-data-in-python-521798e1ed82
+     """
+     root_handlers = logging.getLogger().handlers
+     current_logger = logging.getLogger(name)
+     if not root_handlers:
+         new_handler = logging.StreamHandler()
+         new_handler.terminator = ""
+         new_handler.setFormatter(logging.Formatter("%(message)s"))
+         current_logger.addHandler(new_handler)
+         current_logger.propagate = False
+         current_logger.setLevel(logging.INFO)
+         return current_logger
+
+     for handler in current_logger.handlers[:]:
+         current_logger.removeHandler(handler)
+
+     for handler_r in root_handlers:
+         if type(handler_r) is logging.StreamHandler:
+             new_handler = logging.StreamHandler()
+             new_handler.terminator = ""
+             new_handler.setFormatter(logging.Formatter("%(message)s"))
+             current_logger.addHandler(new_handler)
+         elif type(handler_r) is logging.FileHandler:
+             new_handler = logging.FileHandler(
+                 handler_r.baseFilename,
+                 handler_r.mode,
+                 handler_r.encoding,
+                 handler_r.delay,
+                 handler_r.errors,
+             )
+             new_handler.terminator = ""  # An empty terminator stops the newline after each record.
+             new_handler.setFormatter(logging.Formatter("%(message)s"))
+             current_logger.addHandler(new_handler)
+         else:
+             continue
+     current_logger.propagate = False  # Don't propagate to the root logger.
+     return current_logger
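The empty `terminator` is the whole point: chunked LLM output is appended to one line instead of producing one log record per chunk. A quick sketch:

```python
from bulk_chain.core.utils_logger import StreamedLogger

logger = StreamedLogger(__name__)

# Chunks are appended without newlines between records.
for chunk in ["Hello", ", ", "world", "!", "\n"]:
    logger.info(chunk)
# Prints: Hello, world!
```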