bulk-chain 0.25.0__tar.gz → 0.25.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/PKG-INFO +54 -16
  2. bulk_chain-0.25.2/README.md +127 -0
  3. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/api.py +22 -2
  4. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/llm_base.py +1 -1
  5. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_args.py +6 -1
  6. bulk_chain-0.25.2/bulk_chain/core/service_llm.py +94 -0
  7. bulk_chain-0.25.2/bulk_chain/core/utils_logger.py +41 -0
  8. bulk_chain-0.25.2/bulk_chain/demo.py +84 -0
  9. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/infer.py +5 -35
  10. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/PKG-INFO +54 -16
  11. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/SOURCES.txt +6 -1
  12. bulk_chain-0.25.2/bulk_chain.egg-info/requires.txt +2 -0
  13. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/setup.py +1 -1
  14. bulk_chain-0.25.2/test/test.py +62 -0
  15. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/test/test_api.py +5 -4
  16. bulk_chain-0.25.2/test/test_cmdargs.py +29 -0
  17. bulk_chain-0.25.2/test/test_provider_batching.py +31 -0
  18. bulk_chain-0.25.0/README.md +0 -91
  19. bulk_chain-0.25.0/bulk_chain/core/service_llm.py +0 -82
  20. bulk_chain-0.25.0/test/test_cmdargs.py +0 -20
  21. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/LICENSE +0 -0
  22. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/__init__.py +0 -0
  23. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/__init__.py +0 -0
  24. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_batch.py +0 -0
  25. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_data.py +0 -0
  26. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_dict.py +0 -0
  27. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_json.py +0 -0
  28. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/service_schema.py +0 -0
  29. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain/core/utils.py +0 -0
  30. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/dependency_links.txt +0 -0
  31. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/bulk_chain.egg-info/top_level.txt +0 -0
  32. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/setup.cfg +0 -0
  33. {bulk_chain-0.25.0 → bulk_chain-0.25.2}/test/test_args_seeking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.0
3
+ Version: 0.25.2
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,8 +15,10 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: tqdm
19
+ Requires-Dist: source-iter==0.24.3
18
20
 
19
- # bulk-chain 0.25.0
21
+ # bulk-chain 0.25.2
20
22
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
23
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
24
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -26,7 +28,13 @@ License-File: LICENSE
26
28
  <img src="logo.png"/>
27
29
  </p>
28
30
 
29
- A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
31
+ <p align="center">
32
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
+ <br>
34
+ <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
35
+ </p>
36
+
37
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
30
38
 
31
39
  ### Main Features
32
40
  * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
@@ -42,7 +50,7 @@ A lightweight, no-strings-attached **framework** for your LLM that allows apply
42
50
  From PyPI:
43
51
 
44
52
  ```bash
45
- pip install bulk-chain
53
+ pip install --no-deps bulk-chain
46
54
  ```
47
55
 
48
56
  or latest version from here:
@@ -73,37 +81,67 @@ Below, is an example on how to declare your own schema:
73
81
  }
74
82
  ```
75
83
 
76
- Another templates are available [here](/ext/schema/).
77
-
78
84
  # Usage
79
85
 
80
86
  Preliminary steps:
81
87
 
82
88
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
83
- 2. Wrap or pick **LLM model** from the [list of presets](/ext/).
89
+ 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
84
90
 
85
- ## API
91
+ ## Shell
86
92
 
87
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
93
+ ### Demo Mode
94
+
95
+ **Demo mode** lets you interact with an LLM via the command line, with support for streaming the LLM output.
96
+ The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
97
+
98
+ Quick start with launching demo:
99
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
100
+ 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
101
+ 3. 🚀 Launch `demo.py` as follows:
102
+ ```bash
103
+ python3 -m bulk_chain.demo \
104
+ --schema "test/schema/thor_cot_schema.json" \
105
+ --adapter "dynamic:replicate_104.py:Replicate" \
106
+ %%m \
107
+ --model_name "meta/meta-llama-3-70b-instruct" \
108
+ --api_token "<REPLICATE-API-TOKEN>" \
109
+ --stream
110
+ ```
111
+
112
+ 📺 This video showcases the application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts.
113
+ ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
88
114
 
89
- ## Shell
90
115
 
91
- > **NOTE:** You have to install `source-iter` package
116
+ ### Inference Mode
92
117
 
118
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project
119
+
120
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
121
+ ```bash
122
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
123
+ ```
124
+ 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
125
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
93
126
  ```bash
94
127
  python3 -m bulk_chain.infer \
95
128
  --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
96
- --schema "ext/schema/default.json" \
97
- --adapter "dynamic:ext/replicate.py:Replicate" \
129
+ --schema "test/schema/default.json" \
130
+ --adapter "replicate_104.py:Replicate" \
98
131
  %%m \
99
- --api_token "<REPLICATE-API-TOKEN>" \
100
- --temp 0.1
132
+ --model_name "deepseek-ai/deepseek-r1" \
133
+ --api_token "<REPLICATE-API-TOKEN>"
101
134
  ```
102
135
 
136
+ ## API
137
+
138
+ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
139
+
140
+
103
141
  # Embed your LLM
104
142
 
105
143
  All you have to do is to implement `BaseLM` class, that includes:
106
144
  * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
107
145
  * `ask(prompt)` -- infer your model with the given `prompt`.
108
146
 
109
- See examples with models [here](/ext).
147
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -0,0 +1,127 @@
1
+ # bulk-chain 0.25.2
2
+ ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
3
+ [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
4
+ [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
5
+ [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
6
+
7
+ <p align="center">
8
+ <img src="logo.png"/>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
13
+ <br>
14
+ <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
15
+ </p>
16
+
17
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
18
+
19
+ ### Main Features
20
+ * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
21
+ * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
22
+ * ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
23
+
24
+ ### Extra Features
25
+ * ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
26
+
27
+
28
+ # Installation
29
+
30
+ From PyPI:
31
+
32
+ ```bash
33
+ pip install --no-deps bulk-chain
34
+ ```
35
+
36
+ or latest version from here:
37
+
38
+ ```bash
39
+ pip install git+https://github.com/nicolay-r/bulk-chain@master
40
+ ```
41
+
42
+ ## Chain-of-Thought Schema
43
+
44
+ To declare a Chain-of-Thought (CoT) schema, this project uses the `JSON` format.
45
+ This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
46
+
47
+ Each step is represented by a dictionary with `prompt` and `out` keys that correspond to the input prompt and the output variable name, respectively.
48
+ All the variable names are expected to be mentioned in `{}`.
49
+
50
+ Below is an example of how to declare your own schema:
51
+
52
+ ```python
53
+ {
54
+ "name": "schema-name",
55
+ "schema": [
56
+ {"prompt": "Given the question '{text}', let's think step-by-step.",
57
+ "out": "steps"},
58
+ {"prompt": "For the question '{text}' the reasoning steps are '{steps}'. what would be an answer?",
59
+ "out": "answer"},
60
+ ]
61
+ }
62
+ ```
63
+
64
+ # Usage
65
+
66
+ Preliminary steps:
67
+
68
+ 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
69
+ 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
70
+
71
+ ## Shell
72
+
73
+ ### Demo Mode
74
+
75
+ **Demo mode** lets you interact with an LLM via the command line, with support for streaming the LLM output.
76
+ The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
77
+
78
+ Quick start with launching demo:
79
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
80
+ 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
81
+ 3. 🚀 Launch `demo.py` as follows:
82
+ ```bash
83
+ python3 -m bulk_chain.demo \
84
+ --schema "test/schema/thor_cot_schema.json" \
85
+ --adapter "dynamic:replicate_104.py:Replicate" \
86
+ %%m \
87
+ --model_name "meta/meta-llama-3-70b-instruct" \
88
+ --api_token "<REPLICATE-API-TOKEN>" \
89
+ --stream
90
+ ```
91
+
92
+ 📺 This video showcases the application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts.
93
+ ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
94
+
95
+
96
+ ### Inference Mode
97
+
98
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project
99
+
100
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
101
+ ```bash
102
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
103
+ ```
104
+ 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
105
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
106
+ ```bash
107
+ python3 -m bulk_chain.infer \
108
+ --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
109
+ --schema "test/schema/default.json" \
110
+ --adapter "replicate_104.py:Replicate" \
111
+ %%m \
112
+ --model_name "deepseek-ai/deepseek-r1" \
113
+ --api_token "<REPLICATE-API-TOKEN>"
114
+ ```
115
+
116
+ ## API
117
+
118
+ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
119
+
120
+
121
+ # Embed your LLM
122
+
123
+ All you have to do is to implement `BaseLM` class, that includes:
124
+ * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
125
+ * `ask(prompt)` -- infer your model with the given `prompt`.
126
+
127
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -7,7 +7,7 @@ from bulk_chain.core.service_data import DataService
7
7
  from bulk_chain.core.service_dict import DictionaryService
8
8
  from bulk_chain.core.service_json import JsonService
9
9
  from bulk_chain.core.service_schema import SchemaService
10
-
10
+ from bulk_chain.core.utils import dynamic_init, find_by_prefix
11
11
 
12
12
  INFER_MODES = {
13
13
  "default": lambda llm, prompt, limit_prompt=None: llm.ask_core(
@@ -76,4 +76,24 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, l
76
76
  schema=schema)
77
77
  for batch in BatchIterator(prompts_it, batch_size=batch_size))
78
78
 
79
- yield from content_it if return_batch else chain.from_iterable(content_it)
79
+ yield from content_it if return_batch else chain.from_iterable(content_it)
80
+
81
+
82
+ def init_llm(adapter, **model_kwargs):
83
+ """ This method perform dynamic initialization of LLM from third-party resource.
84
+ """
85
+
86
+ # List of the Supported models and their API wrappers.
87
+ models_preset = {
88
+ "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
89
+ class_name=llm_model_params)(**model_kwargs)
90
+ }
91
+
92
+ # Initialize LLM model.
93
+ params = adapter.split(':')
94
+ llm_model_type = params[0]
95
+ llm_model_name = params[1] if len(params) > 1 else params[-1]
96
+ llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
97
+ llm = find_by_prefix(d=models_preset, key=llm_model_type)()
98
+
99
+ return llm, llm_model_name
@@ -24,7 +24,7 @@ class BaseLM(object):
24
24
  try:
25
25
  if self.__support_batching:
26
26
  # Launch in batch mode.
27
- content = self.ask(batch)
27
+ content = batch
28
28
  else:
29
29
  # Launch in non-batch mode.
30
30
  assert len(batch) == 1, "The LM does not support batching," \
@@ -12,6 +12,11 @@ class CmdArgsService:
12
12
  def iter_arguments(lst):
13
13
 
14
14
  def __release():
15
+
16
+ # We use the True value by default to treat the related parameter as flag.
17
+ if len(buf) == 0:
18
+ buf.append(True)
19
+
15
20
  return key, buf if len(buf) > 1 else buf[0]
16
21
 
17
22
  key = None
@@ -29,7 +34,7 @@ class CmdArgsService:
29
34
  buf.append(a)
30
35
 
31
36
  # Sharing the remaining params.
32
- if len(buf) > 0:
37
+ if key is not None:
33
38
  yield __release()
34
39
 
35
40
  @staticmethod
@@ -0,0 +1,94 @@
1
+ from bulk_chain.core.llm_base import BaseLM
2
+ from bulk_chain.core.service_data import DataService
3
+ from bulk_chain.core.utils import iter_params
4
+ from bulk_chain.core.utils_logger import StreamedLogger
5
+
6
+
7
+ def pad_str(text, pad):
8
+ return text.rjust(len(text) + pad, ' ')
9
+
10
+
11
+ def nice_output(text, remove_new_line=False):
12
+ short_text = text.replace("\n", "") if remove_new_line else text
13
+ return short_text
14
+
15
+
16
+ def chat_with_lm(lm, preset_dict=None, chain=None, model_name=None, pad=0):
17
+ assert (isinstance(lm, BaseLM))
18
+ assert (isinstance(chain, list))
19
+ assert (isinstance(model_name, str) or model_name is None)
20
+
21
+ preset_dict = {} if preset_dict is None else preset_dict
22
+
23
+ streamed_logger = StreamedLogger(__name__)
24
+
25
+ do_exit = False
26
+ model_name = model_name if model_name is not None else "agent"
27
+
28
+ while not do_exit:
29
+
30
+ streamed_logger.info("----------------")
31
+ streamed_logger.info("\n")
32
+
33
+ # Launching the CoT engine loop.
34
+ data_dict = {} | preset_dict
35
+ for chain_ind, prompt_args in enumerate(chain):
36
+
37
+ # Processing the prompt.
38
+ prompt = prompt_args["prompt"]
39
+
40
+ # Filling necessary parameters.
41
+ user_informed = False
42
+ field_names = list(iter_params(prompt))
43
+ for ind, f_name in enumerate(field_names):
44
+
45
+ if f_name in data_dict:
46
+ continue
47
+
48
+ user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
49
+ f"(or 'exit' to quit): ")
50
+ user_informed = True
51
+
52
+ if user_input.lower() == 'exit':
53
+ do_exit = True
54
+ break
55
+
56
+ data_dict[f_name] = user_input
57
+
58
+ if do_exit:
59
+ break
60
+
61
+ # In the case of the initial interaction with the chain.
62
+ # we make sure that aware user for starting interaction.
63
+ if chain_ind == 0 and not user_informed:
64
+ user_input = input(f"Enter to continue (or 'exit' to quit) ...")
65
+ if user_input.lower() == 'exit':
66
+ do_exit = True
67
+
68
+ # Finally asking LLM.
69
+ DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
70
+ actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
71
+
72
+ # Returning meta information, passed to LLM.
73
+ streamed_logger.info(pad_str(f"{model_name} (ask [{chain_ind+1}/{len(chain)}]) ->", pad=pad))
74
+ streamed_logger.info("\n")
75
+ streamed_logger.info(nice_output(actual_prompt, remove_new_line=True))
76
+ streamed_logger.info("\n\n")
77
+
78
+ # Response.
79
+ response = lm.ask_core(batch=[actual_prompt])[0]
80
+ streamed_logger.info(pad_str(f"{model_name} (resp [{chain_ind+1}/{len(chain)}])->", pad=pad))
81
+ streamed_logger.info("\n")
82
+ if isinstance(response, str):
83
+ streamed_logger.info(nice_output(response, remove_new_line=False))
84
+ buffer = [response]
85
+ else:
86
+ buffer = []
87
+ for chunk in response:
88
+ streamed_logger.info(chunk)
89
+ buffer.append(str(chunk))
90
+
91
+ streamed_logger.info("\n\n")
92
+
93
+ # Collecting the answer for the next turn.
94
+ data_dict[prompt_args["out"]] = "".join(buffer)
@@ -0,0 +1,41 @@
1
+ import logging
2
+
3
+
4
+ def StreamedLogger(name: str) -> logging.Logger:
5
+ """ https://medium.com/@r.das699/optimizing-logging-practices-for-streaming-data-in-python-521798e1ed82
6
+ """
7
+ root_handlers = logging.getLogger().handlers
8
+ current_logger = logging.getLogger(name)
9
+ if not root_handlers:
10
+ new_handler = logging.StreamHandler()
11
+ new_handler.terminator = ""
12
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
13
+ current_logger.addHandler(new_handler)
14
+ current_logger.propagate = False
15
+ current_logger.setLevel(logging.INFO)
16
+ return current_logger
17
+
18
+ for handler in current_logger.handlers[:]:
19
+ current_logger.removeHandler(handler)
20
+
21
+ for handler_r in root_handlers:
22
+ if type(handler_r) is logging.StreamHandler:
23
+ new_handler = logging.StreamHandler()
24
+ new_handler.terminator = ""
25
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
26
+ current_logger.addHandler(new_handler)
27
+ elif type(handler_r) is logging.FileHandler:
28
+ new_handler = logging.FileHandler(
29
+ handler_r.baseFilename,
30
+ handler_r.mode,
31
+ handler_r.encoding,
32
+ handler_r.delay,
33
+ handler_r.errors,
34
+ )
35
+ new_handler.terminator = "" # This will stop the printing in new line
36
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
37
+ current_logger.addHandler(new_handler)
38
+ else:
39
+ continue
40
+ current_logger.propagate = False # Don't propagate to root logger
41
+ return current_logger
@@ -0,0 +1,84 @@
1
+ import json
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+
7
+ from source_iter.service_jsonl import JsonlService
8
+
9
+ from bulk_chain.api import init_llm
10
+ from bulk_chain.core.service_args import CmdArgsService
11
+ from bulk_chain.core.service_json import JsonService
12
+ from bulk_chain.core.service_llm import chat_with_lm
13
+ from bulk_chain.core.service_schema import SchemaService
14
+ from bulk_chain.core.utils import parse_filepath
15
+
16
+ logger = logging.getLogger(__name__)
17
+ logging.basicConfig(level=logging.INFO)
18
+
19
+
20
+ def iter_from_json(filepath):
21
+ with open(filepath, "r") as f:
22
+ content = json.load(f)
23
+ for key, value in content.items():
24
+ yield key, value
25
+
26
+
27
+ def iter_from_text_file(filepath):
28
+ with open(filepath, "r") as f:
29
+ yield filepath.split('.')[0], f.read()
30
+
31
+
32
+ if __name__ == '__main__':
33
+
34
+ parser = argparse.ArgumentParser(description="LLM demo usage based on CoT schema")
35
+ parser.add_argument('--adapter', dest='adapter', type=str, default=None)
36
+ parser.add_argument('--attempts', dest='attempts', type=int, default=None)
37
+ parser.add_argument('--src', dest='src', type=str, nargs="*", default=None)
38
+ parser.add_argument('--schema', dest='schema', type=str, default=None,
39
+ help="Path to the JSON file that describes schema")
40
+ parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
41
+ help="Optional trimming prompt by the specified amount of characters.")
42
+
43
+ # Extract native arguments.
44
+ native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
45
+ args = parser.parse_args(args=native_args[1:])
46
+
47
+ # Extract model-related arguments and Initialize Large Language Model.
48
+ model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
49
+ model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
50
+ llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
51
+
52
+ # Setup schema.
53
+ schema = SchemaService(json_data=JsonService.read(args.schema))
54
+ schema_name = schema.src.get("name", None)
55
+ if schema is not None:
56
+ logger.info(f"Using schema: {schema_name}")
57
+
58
+ output_providers = {
59
+ "jsonl": lambda filepath, data_it, header:
60
+ JsonlService.write(target=filepath,
61
+ data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
62
+ }
63
+
64
+ input_file_handlers = {
65
+ "json": lambda filepath: iter_from_json(filepath),
66
+ "txt": lambda filepath: iter_from_text_file(filepath)
67
+ }
68
+
69
+ # Input extension type defines the provider.
70
+ if args.src is None:
71
+ args.src = []
72
+ if isinstance(args.src, str):
73
+ args.src = [args.src]
74
+ sources = [parse_filepath(s) for s in args.src]
75
+
76
+ preset_dict = {}
77
+ for fp, ext, _ in sources:
78
+ for key, value in input_file_handlers[ext](fp):
79
+ if key in preset_dict:
80
+ raise Exception(f"While at handling {fp}: Key {key} is already registered!")
81
+ preset_dict[key] = value
82
+
83
+ # Launch Demo.
84
+ chat_with_lm(llm, preset_dict=preset_dict, chain=schema.chain, model_name=llm_model_name)
@@ -9,51 +9,28 @@ from source_iter.service_jsonl import JsonlService
9
9
  from source_iter.service_sqlite import SQLite3Service
10
10
  from tqdm import tqdm
11
11
 
12
- from bulk_chain.api import INFER_MODES, _infer_batch, CWD
12
+ from bulk_chain.api import INFER_MODES, _infer_batch, CWD, init_llm
13
13
  from bulk_chain.core.llm_base import BaseLM
14
14
  from bulk_chain.core.service_args import CmdArgsService
15
15
  from bulk_chain.core.service_dict import DictionaryService
16
16
  from bulk_chain.core.service_json import JsonService
17
- from bulk_chain.core.service_llm import chat_with_lm
18
17
  from bulk_chain.core.service_schema import SchemaService
19
- from bulk_chain.core.utils import dynamic_init, find_by_prefix, handle_table_name, optional_limit_iter, parse_filepath
18
+ from bulk_chain.core.utils import handle_table_name, optional_limit_iter, parse_filepath
20
19
 
21
20
  logger = logging.getLogger(__name__)
22
21
  logging.basicConfig(level=logging.INFO)
23
22
 
24
-
25
23
  WRITER_PROVIDERS = {
26
24
  "sqlite": lambda filepath, table_name, data_it, infer_data_func, **kwargs: SQLite3Service.write(
27
25
  data_it=data_it, target=filepath, table_name=table_name, data2col_func=infer_data_func,
28
26
  skip_existed=True, **kwargs)
29
27
  }
30
28
 
31
-
32
29
  READER_PROVIDERS = {
33
30
  "sqlite": lambda filepath, table_name: SQLite3Service.read(filepath, table=table_name)
34
31
  }
35
32
 
36
33
 
37
- def init_llm(**model_kwargs):
38
- """ This method perform dynamic initialization of LLM from third-party resource.
39
- """
40
-
41
- # List of the Supported models and their API wrappers.
42
- models_preset = {
43
- "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
44
- class_name=llm_model_params)(**model_kwargs)
45
- }
46
-
47
- # Initialize LLM model.
48
- params = args.adapter.split(':')
49
- llm_model_type = params[0]
50
- llm_model_name = params[1] if len(params) > 1 else params[-1]
51
- llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
52
- llm = find_by_prefix(d=models_preset, key=llm_model_type)()
53
-
54
- return llm, llm_model_name
55
-
56
-
57
34
  def iter_content_cached(input_dicts_it, llm, schema, cache_target, limit_prompt=None, **cache_kwargs):
58
35
  assert (isinstance(llm, BaseLM))
59
36
  assert (isinstance(cache_target, str))
@@ -91,9 +68,8 @@ if __name__ == '__main__':
91
68
 
92
69
  parser = argparse.ArgumentParser(description="Infer Instruct LLM inference based on CoT schema")
93
70
  parser.add_argument('--adapter', dest='adapter', type=str, default=None)
94
- parser.add_argument('--attempts', dest='attempts', type=int, default=None)
95
71
  parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
96
- parser.add_argument('--src', dest='src', type=str, default=None)
72
+ parser.add_argument('--src', dest='src', type=str, nargs="?", default=None)
97
73
  parser.add_argument('--schema', dest='schema', type=str, default=None,
98
74
  help="Path to the JSON file that describes schema")
99
75
  parser.add_argument('--to', dest='to', type=str, default=None, choices=["csv", "sqlite"])
@@ -114,7 +90,7 @@ if __name__ == '__main__':
114
90
  # Extract model-related arguments and Initialize Large Language Model.
115
91
  model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
116
92
  model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
117
- llm, llm_model_name = init_llm(**model_args_dict)
93
+ llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
118
94
 
119
95
  # Setup schema.
120
96
  schema = SchemaService(json_data=JsonService.read(args.schema))
@@ -123,7 +99,6 @@ if __name__ == '__main__':
123
99
  logger.info(f"Using schema: {schema_name}")
124
100
 
125
101
  input_providers = {
126
- None: lambda _: chat_with_lm(llm, chain=schema.chain, model_name=llm_model_name),
127
102
  "csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
128
103
  as_dict=True, skip_header=True,
129
104
  delimiter=csv_args_dict.get("delimiter", ","),
@@ -155,14 +130,9 @@ if __name__ == '__main__':
155
130
  args.output = args.output.format(model=llm.name()) if args.output is not None else args.output
156
131
  tgt_filepath, tgt_ext, tgt_meta = parse_filepath(args.output, default_ext=args.to)
157
132
 
158
- # Input extension type defines the provider.
133
+ # We do not support multiple files for other modes.
159
134
  src_filepath, src_ext, src_meta = parse_filepath(args.src)
160
135
 
161
- # Check whether we are in chat mode.
162
- if src_ext is None:
163
- input_providers[src_ext](None)
164
- exit(0)
165
-
166
136
  def default_output_file_template(ext):
167
137
  # This is a default template for output files to be generated.
168
138
  return "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema_name]), ext])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.0
3
+ Version: 0.25.2
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,8 +15,10 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: tqdm
19
+ Requires-Dist: source-iter==0.24.3
18
20
 
19
- # bulk-chain 0.25.0
21
+ # bulk-chain 0.25.2
20
22
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
23
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
24
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -26,7 +28,13 @@ License-File: LICENSE
26
28
  <img src="logo.png"/>
27
29
  </p>
28
30
 
29
- A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
31
+ <p align="center">
32
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
+ <br>
34
+ <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
35
+ </p>
36
+
37
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
30
38
 
31
39
  ### Main Features
32
40
  * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
@@ -42,7 +50,7 @@ A lightweight, no-strings-attached **framework** for your LLM that allows apply
42
50
  From PyPI:
43
51
 
44
52
  ```bash
45
- pip install bulk-chain
53
+ pip install --no-deps bulk-chain
46
54
  ```
47
55
 
48
56
  or latest version from here:
@@ -73,37 +81,67 @@ Below, is an example on how to declare your own schema:
73
81
  }
74
82
  ```
75
83
 
76
- Another templates are available [here](/ext/schema/).
77
-
78
84
  # Usage
79
85
 
80
86
  Preliminary steps:
81
87
 
82
88
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
83
- 2. Wrap or pick **LLM model** from the [list of presets](/ext/).
89
+ 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
84
90
 
85
- ## API
91
+ ## Shell
86
92
 
87
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
93
+ ### Demo Mode
94
+
95
+ Use **demo mode** to interact with an LLM via the command line, with support for streaming the LLM output.
96
+ The video below illustrates an example application: sentiment analysis that extracts the author's opinion towards an object mentioned in the text.
97
+
98
+ Quick start with launching demo:
99
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
100
+ 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
101
+ 3. 🚀 Launch `demo.py` as follows:
102
+ ```bash
103
+ python3 -m bulk_chain.demo \
104
+ --schema "test/schema/thor_cot_schema.json" \
105
+ --adapter "dynamic:replicate_104.py:Replicate" \
106
+ %%m \
107
+ --model_name "meta/meta-llama-3-70b-instruct" \
108
+ --api_token "<REPLICATE-API-TOKEN>" \
109
+ --stream
110
+ ```
111
+
112
+ 📺 This video showcases the application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) to [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct), hosted by Replicate, for reasoning over submitted texts.
113
+ ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
88
114
 
89
- ## Shell
90
115
 
91
- > **NOTE:** You have to install `source-iter` package
116
+ ### Inference Mode
92
117
 
118
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
119
+
120
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
121
+ ```bash
122
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
123
+ ```
124
+ 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
125
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
93
126
  ```bash
94
127
  python3 -m bulk_chain.infer \
95
128
  --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
96
- --schema "ext/schema/default.json" \
97
- --adapter "dynamic:ext/replicate.py:Replicate" \
129
+ --schema "test/schema/default.json" \
130
+ --adapter "replicate_104.py:Replicate" \
98
131
  %%m \
99
- --api_token "<REPLICATE-API-TOKEN>" \
100
- --temp 0.1
132
+ --model_name "deepseek-ai/deepseek-r1" \
133
+ --api_token "<REPLICATE-API-TOKEN>"
101
134
  ```
102
135
 
136
+ ## API
137
+
138
+ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
139
+
140
+
103
141
  # Embed your LLM
104
142
 
105
143
  All you have to do is to implement `BaseLM` class, that includes:
106
144
  * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
107
145
  * `ask(prompt)` -- infer your model with the given `prompt`.
108
146
 
109
- See examples with models [here](/ext).
147
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -3,10 +3,12 @@ README.md
3
3
  setup.py
4
4
  bulk_chain/__init__.py
5
5
  bulk_chain/api.py
6
+ bulk_chain/demo.py
6
7
  bulk_chain/infer.py
7
8
  bulk_chain.egg-info/PKG-INFO
8
9
  bulk_chain.egg-info/SOURCES.txt
9
10
  bulk_chain.egg-info/dependency_links.txt
11
+ bulk_chain.egg-info/requires.txt
10
12
  bulk_chain.egg-info/top_level.txt
11
13
  bulk_chain/core/__init__.py
12
14
  bulk_chain/core/llm_base.py
@@ -18,6 +20,9 @@ bulk_chain/core/service_json.py
18
20
  bulk_chain/core/service_llm.py
19
21
  bulk_chain/core/service_schema.py
20
22
  bulk_chain/core/utils.py
23
+ bulk_chain/core/utils_logger.py
24
+ test/test.py
21
25
  test/test_api.py
22
26
  test/test_args_seeking.py
23
- test/test_cmdargs.py
27
+ test/test_cmdargs.py
28
+ test/test_provider_batching.py
@@ -0,0 +1,2 @@
1
+ tqdm
2
+ source-iter==0.24.3
@@ -15,7 +15,7 @@ def get_requirements(filenames):
15
15
 
16
16
  setup(
17
17
  name='bulk_chain',
18
- version='0.25.0',
18
+ version='0.25.2',
19
19
  python_requires=">=3.6",
20
20
  description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
21
21
  'ensuring reliable results for bulk input requests.',
@@ -0,0 +1,62 @@
1
+ import random
2
+ import time
3
+ import logging
4
+
5
+
6
+ def setup_logger_behaviour(name: str) -> logging.Logger:
7
+ root_handlers = logging.getLogger().handlers # gets root logger
8
+ current_logger = logging.getLogger(name) # gets current logger
9
+ if not root_handlers: # if root logger has no handlers then create streaming handeler only
10
+ new_handler = logging.StreamHandler()
11
+ new_handler.terminator = ""
12
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
13
+ current_logger.addHandler(new_handler)
14
+ current_logger.propagate = False
15
+ current_logger.setLevel(logging.INFO)
16
+ return current_logger
17
+
18
+ # Remove exixting Handlers from the current logger
19
+ for handler in current_logger.handlers[:]:
20
+ current_logger.removeHandler(handler)
21
+
22
+ for handler_r in root_handlers: # if root logger has handlers
23
+ if type(handler_r) is logging.StreamHandler: # if root logger has streaming handler
24
+ new_handler = logging.StreamHandler()
25
+ new_handler.terminator = "" # This will stop the printing in new line
26
+ new_handler.setFormatter(logging.Formatter("%(message)s")) # This will set the format
27
+ current_logger.addHandler(new_handler)
28
+ elif type(handler_r) is logging.FileHandler: # if root logger has file handler
29
+ new_handler = logging.FileHandler( # create new file handler
30
+ handler_r.baseFilename, # with same filename and other properties
31
+ handler_r.mode,
32
+ handler_r.encoding,
33
+ handler_r.delay,
34
+ handler_r.errors,
35
+ )
36
+ new_handler.terminator = "" # This will stop the printing in new line
37
+ new_handler.setFormatter(logging.Formatter("%(message)s")) # This will set the format
38
+ current_logger.addHandler(new_handler)
39
+ else:
40
+ continue
41
+ current_logger.propagate = False # Don't propagate to root logger
42
+ return current_logger
43
+
44
+ # Configure the logger
45
+ logger =logging.getLogger(__name__)
46
+ class FakeStreamingDataGenerator:
47
+
48
+ def stream_data(self):
49
+ while True:
50
+ data = random.randint(0, 100)
51
+ yield data
52
+ time.sleep(0.5)
53
+
54
+ # Example usage:
55
+ generator = FakeStreamingDataGenerator()
56
+ stream = generator.stream_data()
57
+
58
+ logger = setup_logger_behaviour(__name__) # call you set up function here
59
+ while True:
60
+ chunk = next(stream)
61
+ # Replacing print with logger
62
+ logger.info(chunk) # Best practice now
@@ -9,8 +9,9 @@ from bulk_chain.infer import iter_content_cached
9
9
  class TestAPI(unittest.TestCase):
10
10
 
11
11
  llm = dynamic_init(class_dir=join(CWD, ".."),
12
- class_filepath="ext/replicate.py",
13
- class_name="Replicate")(api_token="<API-KEY>")
12
+ class_filepath="providers/replicate_104.py",
13
+ class_name="Replicate")(api_token="<API-KEY>",
14
+ model_name="deepseek-ai/deepseek-r1")
14
15
 
15
16
  def it_data(self, n):
16
17
  for i in range(n):
@@ -19,7 +20,7 @@ class TestAPI(unittest.TestCase):
19
20
  def test_iter_cached(self):
20
21
  data_it = iter_content_cached(input_dicts_it=self.it_data(20),
21
22
  llm=self.llm,
22
- schema="../ext/schema/default.json",
23
+ schema="../schema/default.json",
23
24
  # Cache-related extra parameters.
24
25
  cache_target="out.sqlite:content",
25
26
  id_column_name="ind")
@@ -32,7 +33,7 @@ class TestAPI(unittest.TestCase):
32
33
  llm=self.llm,
33
34
  batch_size=1,
34
35
  return_batch=True,
35
- schema="../ext/schema/default.json")
36
+ schema="../schema/default.json")
36
37
 
37
38
  for data in data_it:
38
39
  print(data)
@@ -0,0 +1,29 @@
1
+ import sys
2
+ import unittest
3
+
4
+ from bulk_chain.core.service_args import CmdArgsService
5
+
6
+
7
+ class TestCmdArgs(unittest.TestCase):
8
+
9
+ def test(self):
10
+
11
+ # Csv-related.
12
+ csv_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%csv", end_prefix="%%")
13
+ print(csv_args)
14
+ csv_args = CmdArgsService.args_to_dict(csv_args)
15
+ print("csv\t", csv_args)
16
+
17
+ # Model-related.
18
+ m_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%m", end_prefix="%%")
19
+ m_args = CmdArgsService.args_to_dict(m_args)
20
+ print("mod\t", m_args)
21
+
22
+ # native.
23
+ n_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
24
+ n_args = CmdArgsService.args_to_dict(n_args)
25
+ print("nat\t", n_args)
26
+
27
+
28
+ if __name__ == '__main__':
29
+ unittest.main()
@@ -0,0 +1,31 @@
1
+ import unittest
2
+ from os.path import join
3
+
4
+ from tqdm import tqdm
5
+
6
+ from bulk_chain.api import CWD, iter_content
7
+ from bulk_chain.core.utils import dynamic_init
8
+ from utils import iter_test_jsonl_samples
9
+
10
+
11
+ class TestProviderBatching(unittest.TestCase):
12
+
13
+ llm = dynamic_init(class_dir=join(CWD, ".."),
14
+ class_filepath="providers/transformers_flan_t5.py",
15
+ class_name="FlanT5")(model_name="nicolay-r/flan-t5-tsa-thor-base",
16
+ max_new_tokens=128)
17
+
18
+ def test_iter(self):
19
+ input_dicts_it = iter_test_jsonl_samples()
20
+ data_it = iter_content(input_dicts_it=input_dicts_it,
21
+ llm=self.llm,
22
+ batch_size=20,
23
+ return_batch=False,
24
+ schema="schema/default.json")
25
+
26
+ for item in tqdm(data_it):
27
+ print(item)
28
+
29
+
30
+ if __name__ == '__main__':
31
+ unittest.main()
@@ -1,91 +0,0 @@
1
- # bulk-chain 0.25.0
2
- ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
3
- [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
4
- [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
5
- [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
6
-
7
- <p align="center">
8
- <img src="logo.png"/>
9
- </p>
10
-
11
- A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
12
-
13
- ### Main Features
14
- * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
15
- * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
16
- * ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
17
-
18
- ### Extra Features
19
- * ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
20
-
21
-
22
- # Installation
23
-
24
- From PyPI:
25
-
26
- ```bash
27
- pip install bulk-chain
28
- ```
29
-
30
- or latest version from here:
31
-
32
- ```bash
33
- pip install git+https://github.com/nicolay-r/bulk-chain@master
34
- ```
35
-
36
- ## Chain-of-Thought Schema
37
-
38
- To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
39
- This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
40
-
41
- Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
42
- All the variable names are expected to be mentioned in `{}`.
43
-
44
- Below, is an example on how to declare your own schema:
45
-
46
- ```python
47
- {
48
- "name": "schema-name",
49
- "schema": [
50
- {"prompt": "Given the question '{text}', let's think step-by-step.",
51
- "out": "steps"},
52
- {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
53
- "out": "answer"},
54
- ]
55
- }
56
- ```
57
-
58
- Another templates are available [here](/ext/schema/).
59
-
60
- # Usage
61
-
62
- Preliminary steps:
63
-
64
- 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
65
- 2. Wrap or pick **LLM model** from the [list of presets](/ext/).
66
-
67
- ## API
68
-
69
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
70
-
71
- ## Shell
72
-
73
- > **NOTE:** You have to install `source-iter` package
74
-
75
- ```bash
76
- python3 -m bulk_chain.infer \
77
- --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
78
- --schema "ext/schema/default.json" \
79
- --adapter "dynamic:ext/replicate.py:Replicate" \
80
- %%m \
81
- --api_token "<REPLICATE-API-TOKEN>" \
82
- --temp 0.1
83
- ```
84
-
85
- # Embed your LLM
86
-
87
- All you have to do is to implement `BaseLM` class, that includes:
88
- * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
89
- * `ask(prompt)` -- infer your model with the given `prompt`.
90
-
91
- See examples with models [here](/ext).
@@ -1,82 +0,0 @@
1
- import logging
2
-
3
- from bulk_chain.core.llm_base import BaseLM
4
- from bulk_chain.core.service_data import DataService
5
- from bulk_chain.core.utils import iter_params
6
-
7
-
8
- def pad_str(text, pad):
9
- return text.rjust(len(text) + pad, ' ')
10
-
11
-
12
- def text_wrap(content, width, handle_line=lambda l: l):
13
- lines = []
14
- for text in content.split('\n'):
15
- for i in range(0, len(text), width):
16
- line = handle_line(text[i:i + width])
17
- lines.append(line)
18
- return '\n'.join(lines)
19
-
20
-
21
- def nice_output(text, width, pad=4, remove_new_line=False):
22
- short_text = text.replace("\n", "") if remove_new_line else text
23
- return text_wrap(content=short_text, width=width, handle_line=lambda line: pad_str(line, pad=pad))
24
-
25
-
26
- def chat_with_lm(lm, chain=None, model_name=None):
27
- assert (isinstance(lm, BaseLM))
28
- assert (isinstance(chain, list))
29
- assert (isinstance(model_name, str) or model_name is None)
30
-
31
- logger = logging.getLogger(__name__)
32
- logging.basicConfig(level=logging.INFO)
33
-
34
- do_exit = False
35
- model_name = model_name if model_name is not None else "agent"
36
-
37
- while not do_exit:
38
-
39
- logger.info("----------------")
40
-
41
- # Launching the CoT engine loop.
42
- data_dict = {}
43
- for prompt_args in chain:
44
-
45
- # Processing the prompt.
46
- prompt = prompt_args["prompt"]
47
-
48
- # Filling necessary parameters.
49
- field_names = list(iter_params(prompt))
50
- for ind, f_name in enumerate(field_names):
51
-
52
- if f_name in data_dict:
53
- continue
54
-
55
- user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
56
- f"(or 'exit' to quit): ")
57
-
58
- if user_input.lower() == 'exit':
59
- do_exit = True
60
- break
61
-
62
- data_dict[f_name] = user_input
63
-
64
- if do_exit:
65
- break
66
-
67
- # Finally asking LLM.
68
- DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
69
- actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
70
-
71
- # Returning meta information, passed to LLM.
72
- pad = 4
73
- logger.info(pad_str(f"{model_name} (ask) ->", pad=pad))
74
- logger.info(nice_output(actual_prompt, pad=pad*2, remove_new_line=True, width=80))
75
-
76
- # Response.
77
- response_batch = lm.ask_core(batch=[actual_prompt])
78
- logger.info(pad_str(f"{model_name} (resp)->", pad=pad))
79
- logger.info(nice_output(response_batch[0], pad=pad * 2, remove_new_line=False, width=80))
80
-
81
- # Collecting the answer for the next turn.
82
- data_dict[prompt_args["out"]] = response_batch[0]
@@ -1,20 +0,0 @@
1
- import sys
2
-
3
- from bulk_chain.core.service_args import CmdArgsService
4
-
5
-
6
- # Csv-related.
7
- csv_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%csv", end_prefix="%%")
8
- print(csv_args)
9
- csv_args = CmdArgsService.args_to_dict(csv_args)
10
- print("csv\t", csv_args)
11
-
12
- # Model-related.
13
- m_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%m", end_prefix="%%")
14
- m_args = CmdArgsService.args_to_dict(m_args)
15
- print("mod\t", m_args)
16
-
17
- # native.
18
- n_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
19
- n_args = CmdArgsService.args_to_dict(n_args)
20
- print("nat\t", n_args)
File without changes
File without changes