bulk-chain 0.25.2__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/PKG-INFO +4 -52
  2. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/README.md +3 -50
  3. bulk_chain-1.0.0/bulk_chain/api.py +143 -0
  4. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/llm_base.py +1 -3
  5. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_batch.py +4 -21
  6. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_data.py +9 -5
  7. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/utils.py +15 -25
  8. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/PKG-INFO +4 -52
  9. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/SOURCES.txt +1 -6
  10. bulk_chain-1.0.0/bulk_chain.egg-info/requires.txt +1 -0
  11. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/setup.py +1 -1
  12. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test_api.py +7 -16
  13. bulk_chain-1.0.0/test/test_api_streaming.py +52 -0
  14. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test_provider_batching.py +4 -3
  15. bulk_chain-0.25.2/bulk_chain/api.py +0 -99
  16. bulk_chain-0.25.2/bulk_chain/core/service_args.py +0 -72
  17. bulk_chain-0.25.2/bulk_chain/core/service_llm.py +0 -94
  18. bulk_chain-0.25.2/bulk_chain/core/utils_logger.py +0 -41
  19. bulk_chain-0.25.2/bulk_chain/demo.py +0 -84
  20. bulk_chain-0.25.2/bulk_chain/infer.py +0 -161
  21. bulk_chain-0.25.2/bulk_chain.egg-info/requires.txt +0 -2
  22. bulk_chain-0.25.2/test/test_cmdargs.py +0 -29
  23. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/LICENSE +0 -0
  24. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/__init__.py +0 -0
  25. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/__init__.py +0 -0
  26. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_dict.py +0 -0
  27. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_json.py +0 -0
  28. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain/core/service_schema.py +0 -0
  29. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/dependency_links.txt +0 -0
  30. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/bulk_chain.egg-info/top_level.txt +0 -0
  31. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/setup.cfg +0 -0
  32. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test.py +0 -0
  33. {bulk_chain-0.25.2 → bulk_chain-1.0.0}/test/test_args_seeking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.2
3
+ Version: 1.0.0
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -16,9 +16,8 @@ Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: tqdm
19
- Requires-Dist: source-iter==0.24.3
20
19
 
21
- # bulk-chain 0.25.2
20
+ # bulk-chain 1.0.0
22
21
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
23
22
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
24
23
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -31,7 +30,7 @@ Requires-Dist: source-iter==0.24.3
31
30
  <p align="center">
32
31
  <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
32
  <br>
34
- <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
33
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
35
34
  </p>
36
35
 
37
36
  A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
@@ -39,11 +38,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
39
38
  ### Main Features
40
39
  * ✅ **No-strings**: you're free from LLM dependencies, with flexible `venv` customization.
41
40
  * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
42
- * ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
43
-
44
- ### Extra Features
45
- * ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
46
-
41
+ * ✅ **Provides iterator over infinite amount of input contexts**
47
42
 
48
43
  # Installation
49
44
 
@@ -88,51 +83,8 @@ Preliminary steps:
88
83
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
89
84
  2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
90
85
 
91
- ## Shell
92
-
93
- ### Demo Mode
94
-
95
- **demo mode** to interact with LLM via command line with LLM output streaming support.
96
- The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
97
-
98
- Quck start with launching demo:
99
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
100
- 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
101
- 3. 🚀 Launch `demo.py` as follows:
102
- ```bash
103
- python3 -m bulk_chain.demo \
104
- --schema "test/schema/thor_cot_schema.json" \
105
- --adapter "dynamic:replicate_104.py:Replicate" \
106
- %%m \
107
- --model_name "meta/meta-llama-3-70b-instruct" \
108
- --api_token "<REPLICATE-API-TOKEN>" \
109
- --stream
110
- ```
111
-
112
- 📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
113
- ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
114
86
 
115
87
 
116
- ### Inference Mode
117
-
118
- > **NOTE:** You have to install `source-iter` and `tqdm` packages that actual [dependencies](dependencies.txt) of this project
119
-
120
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
121
- ```bash
122
- wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
123
- ```
124
- 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
125
- 3. 🚀 Launch inference using `DeepSeek-R1`:
126
- ```bash
127
- python3 -m bulk_chain.infer \
128
- --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
129
- --schema "test/schema/default.json" \
130
- --adapter "replicate_104.py:Replicate" \
131
- %%m \
132
- --model_name "deepseek-ai/deepseek-r1" \
133
- --api_token "<REPLICATE-API-TOKEN>"
134
- ```
135
-
136
88
  ## API
137
89
 
138
90
  Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
@@ -1,4 +1,4 @@
1
- # bulk-chain 0.25.2
1
+ # bulk-chain 1.0.0
2
2
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
3
3
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
4
4
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -11,7 +11,7 @@
11
11
  <p align="center">
12
12
  <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
13
13
  <br>
14
- <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
14
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
15
15
  </p>
16
16
 
17
17
  A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
@@ -19,11 +19,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
19
19
  ### Main Features
20
20
  * ✅ **No-strings**: you're free from LLM dependencies, with flexible `venv` customization.
21
21
  * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
22
- * ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
23
-
24
- ### Extra Features
25
- * ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
26
-
22
+ * ✅ **Provides iterator over infinite amount of input contexts**
27
23
 
28
24
  # Installation
29
25
 
@@ -68,51 +64,8 @@ Preliminary steps:
68
64
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
69
65
  2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
70
66
 
71
- ## Shell
72
-
73
- ### Demo Mode
74
-
75
- **demo mode** to interact with LLM via command line with LLM output streaming support.
76
- The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
77
-
78
- Quck start with launching demo:
79
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
80
- 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
81
- 3. 🚀 Launch `demo.py` as follows:
82
- ```bash
83
- python3 -m bulk_chain.demo \
84
- --schema "test/schema/thor_cot_schema.json" \
85
- --adapter "dynamic:replicate_104.py:Replicate" \
86
- %%m \
87
- --model_name "meta/meta-llama-3-70b-instruct" \
88
- --api_token "<REPLICATE-API-TOKEN>" \
89
- --stream
90
- ```
91
-
92
- 📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
93
- ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
94
67
 
95
68
 
96
- ### Inference Mode
97
-
98
- > **NOTE:** You have to install `source-iter` and `tqdm` packages that actual [dependencies](dependencies.txt) of this project
99
-
100
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
101
- ```bash
102
- wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
103
- ```
104
- 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
105
- 3. 🚀 Launch inference using `DeepSeek-R1`:
106
- ```bash
107
- python3 -m bulk_chain.infer \
108
- --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
109
- --schema "test/schema/default.json" \
110
- --adapter "replicate_104.py:Replicate" \
111
- %%m \
112
- --model_name "deepseek-ai/deepseek-r1" \
113
- --api_token "<REPLICATE-API-TOKEN>"
114
- ```
115
-
116
69
  ## API
117
70
 
118
71
  Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
@@ -0,0 +1,143 @@
1
+ import collections
2
+ import os
3
+ from itertools import chain
4
+
5
+ from bulk_chain.core.llm_base import BaseLM
6
+ from bulk_chain.core.service_batch import BatchIterator
7
+ from bulk_chain.core.service_data import DataService
8
+ from bulk_chain.core.service_dict import DictionaryService
9
+ from bulk_chain.core.service_json import JsonService
10
+ from bulk_chain.core.service_schema import SchemaService
11
+ from bulk_chain.core.utils import dynamic_init, find_by_prefix
12
+
13
+
14
def _ask_batch(llm, batch, limit_prompt=None):
    """ Passes `batch` through `DataService.limit_prompts` (with `limit=limit_prompt`)
        and forwards the result to `llm.ask_core`, returning its output.
    """
    limited_batch = DataService.limit_prompts(batch, limit=limit_prompt)
    return llm.ask_core(limited_batch)


# Inference handlers keyed by mode name; each accepts (llm, batch, limit_prompt).
INFER_MODES = {
    "batch": _ask_batch
}


# Working directory captured once, at import time.
CWD = os.getcwd()
21
+
22
+
23
+ def _iter_entry_content(entry, entry_info=None, **kwargs):
24
+
25
+ if isinstance(entry, str):
26
+ kwargs.get("callback_str_func", lambda *_: None)(entry, entry_info)
27
+ yield entry
28
+ elif isinstance(entry, collections.abc.Iterable):
29
+ h = kwargs.get("callback_stream_func", lambda *_: None)
30
+ h(None, entry_info | {"action": "start"})
31
+ for chunk in map(lambda item: str(item), entry):
32
+ yield chunk
33
+ h(chunk, entry_info)
34
+ h(None, entry_info | {"action": "end"})
35
+ else:
36
+ raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
37
+
38
+
39
def _iter_batch_prompts(c, batch_content_it, **kwargs):
    """ Yields `(index_in_batch, prompt_text)` for every record of the batch.

        The prompt template is read from `record[c]["prompt"]` and rendered by
        `DataService.get_prompt_text` using the record itself as the data
        source; `kwargs["handle_missed_value_func"]` is forwarded as the
        handler for fields that are missing from the record.
    """
    position = 0
    for record in batch_content_it:
        template = record[c]["prompt"]
        rendered = DataService.get_prompt_text(
            prompt=template,
            data_dict=record,
            handle_missed_func=kwargs["handle_missed_value_func"])
        yield position, rendered
        position += 1
46
+
47
+
48
def _iter_batch_responses(p_column, c, batch_content_it, **kwargs):
    """ Yields `(index_in_batch, chunk_iterator)` for every response of the batch.

        The values of `p_column` are collected from all records and passed at
        once to `kwargs["handle_batch_func"]`; every returned entry is wrapped
        with `_iter_entry_content`, tagged with its index and the column `c`.
    """
    prompts = []
    for record in batch_content_it:
        prompts.append(record[p_column])

    # TODO. This part could be async.
    # TODO. ind_in_batch might be a part of the async return.
    responses = kwargs["handle_batch_func"](prompts)
    for position, response in enumerate(responses):
        info = {"ind": position, "param": c}
        yield position, _iter_entry_content(entry=response, entry_info=info, **kwargs)
54
+
55
+
56
def _infer_batch(batch, schema, return_mode, cols=None, **kwargs):
    """ Generator that fills the schema-related columns of every record in `batch`.

        batch: list of dict records; the records are updated in place.
        schema: object exposing `p2r` and `r2p`; a column found in `p2r` is
            treated as a prompt column, a column found in `r2p` as a response
            column whose prompt column is `schema.r2p[column]`.
        return_mode:
            "chunk"  -- yields `[ind_in_batch, column, chunk]` for every
                        response chunk as soon as it is received;
            "record" -- yields every record once all the columns are handled;
            "batch"  -- yields the whole batch once all the columns are handled.
        cols: columns to handle, in order; by default the keys of the first record.
        kwargs: forwarded to `_iter_batch_prompts` and `_iter_batch_responses`.

        Nothing is yielded for an empty batch.
    """
    assert (isinstance(batch, list))

    # A generator cannot hand a value back to the consumer via `return`,
    # so an empty batch simply ends the iteration.
    if not batch:
        return

    if cols is None:
        cols = list(batch[0].keys())

    for c in cols:

        # Handling prompt column.
        if c in schema.p2r:
            content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
            for ind_in_batch, prompt in content_it:
                batch[ind_in_batch][c] = prompt

        # Handling column for inference.
        if c in schema.r2p:
            content_it = _iter_batch_responses(c=c, p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
            for ind_in_batch, chunk_it in content_it:

                chunks = []
                for chunk in chunk_it:
                    chunks.append(chunk)

                    if return_mode == "chunk":
                        yield [ind_in_batch, c, chunk]

                # The complete response is stored in the record in every mode.
                batch[ind_in_batch][c] = "".join(chunks)

    if return_mode == "record":
        yield from batch

    if return_mode == "batch":
        yield batch
94
+
95
+
96
def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None, return_mode="batch", **kwargs):
    """ This method represent Python API aimed at application of `llm` towards
        iterator of input_dicts, batch by batch, using the given `schema`.

        input_dicts_it: iterable of items convertible with `dict(...)`.
        llm: instance of `BaseLM`.
        schema: a `str` (loaded with `JsonService.read`), a `dict` (wrapped
            into `SchemaService`), or an already initialized schema object.
        batch_size: size of the batches composed by `BatchIterator`.
        limit_prompt: forwarded as `limit` to `DataService.limit_prompts`.
        return_mode:
            "batch"  -- yields every processed batch (list of dicts);
            "record" -- yields every processed record (dict);
            "chunk"  -- yields `[ind_in_batch, column, chunk]` for every
                        response chunk.
        kwargs: forwarded to the batch handling, which reads
            `handle_missed_value_func` (looked up without a default while
            prompt columns are composed), and optionally `callback_str_func`,
            `callback_stream_func` and `cols`.
    """
    # "record" is implemented by the batch handling, so it is accepted here too.
    assert (return_mode in ["batch", "chunk", "record"])
    assert (isinstance(llm, BaseLM))

    # Quick initialization of the schema.
    if isinstance(schema, str):
        schema = JsonService.read(schema)
    if isinstance(schema, dict):
        schema = SchemaService(json_data=schema)

    # Every input is copied into a new dict before it is combined with the schema arguments.
    prompts_it = map(
        lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
        input_dicts_it
    )

    content_it = (_infer_batch(batch=batch,
                               handle_batch_func=lambda prompts: INFER_MODES["batch"](llm, prompts, limit_prompt),
                               return_mode=return_mode,
                               schema=schema,
                               **kwargs)
                  for batch in BatchIterator(prompts_it, batch_size=batch_size))

    yield from chain.from_iterable(content_it)
123
+
124
+
125
def init_llm(adapter, **model_kwargs):
    """ Dynamically initializes an LLM wrapper described by the `adapter` string.

        adapter: colon-separated string; the first part is the adapter type,
            the second part is passed to `dynamic_init` as the file path of
            the wrapper, and the remaining parts (joined back with ':') as its
            class name.
        model_kwargs: passed to the constructor of the loaded wrapper.

        Returns the pair `(llm, file path part of the adapter)`.
    """
    assert (isinstance(adapter, str))

    # Parsing the adapter description.
    parts = adapter.split(':')
    llm_model_type = parts[0]
    if len(parts) > 1:
        llm_model_name = parts[1]
    else:
        llm_model_name = parts[-1]
    llm_model_params = None
    if len(parts) > 2:
        llm_model_params = ':'.join(parts[2:])

    def _init_dynamic():
        loaded = dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
                              class_name=llm_model_params)
        return loaded(**model_kwargs)

    # List of the Supported models and their API wrappers.
    models_preset = {
        "dynamic": _init_dynamic
    }

    # Initialize LLM model.
    factory = find_by_prefix(d=models_preset, key=llm_model_type)
    llm = factory()

    return llm, llm_model_name
@@ -1,8 +1,6 @@
1
1
  import logging
2
2
  import time
3
3
 
4
- from bulk_chain.core.utils import format_model_name
5
-
6
4
 
7
5
  class BaseLM(object):
8
6
 
@@ -49,4 +47,4 @@ class BaseLM(object):
49
47
  raise NotImplemented()
50
48
 
51
49
  def name(self):
52
- return format_model_name(self.__name)
50
+ return self.__name.replace("/", "_")
@@ -1,31 +1,13 @@
1
- class BatchService(object):
2
-
3
- @staticmethod
4
- def handle_param_as_batch(batch, src_param, tgt_param, handle_func):
5
- assert (isinstance(batch, list))
6
- assert (isinstance(src_param, str))
7
- assert (callable(handle_func))
8
-
9
- _batch = [item[src_param] for item in batch]
10
-
11
- # Do handling for the batch.
12
- _handled_batch = handle_func(_batch)
13
- assert (isinstance(_handled_batch, list))
14
-
15
- # Apply changes.
16
- for i, item in enumerate(batch):
17
- item[tgt_param] = _handled_batch[i]
18
-
19
-
20
1
  class BatchIterator:
21
2
 
22
- def __init__(self, data_iter, batch_size, end_value=None):
3
+ def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
23
4
  assert(isinstance(batch_size, int) and batch_size > 0)
24
5
  assert(callable(end_value) or end_value is None)
25
6
  self.__data_iter = data_iter
26
7
  self.__index = 0
27
8
  self.__batch_size = batch_size
28
9
  self.__end_value = end_value
10
+ self.__filter_func = (lambda _: True) if filter_func is None else filter_func
29
11
 
30
12
  def __iter__(self):
31
13
  return self
@@ -37,7 +19,8 @@ class BatchIterator:
37
19
  data = next(self.__data_iter)
38
20
  except StopIteration:
39
21
  break
40
- buffer.append(data)
22
+ if self.__filter_func(data):
23
+ buffer.append(data)
41
24
  if len(buffer) == self.__batch_size:
42
25
  break
43
26
 
@@ -4,8 +4,8 @@ from bulk_chain.core.utils import iter_params
4
4
  class DataService(object):
5
5
 
6
6
  @staticmethod
7
- def compose_prompt_text(prompt, data_dict, field_names):
8
- assert(isinstance(data_dict, dict))
7
+ def __compose_prompt_text(prompt, data_dict, field_names):
8
+ assert (isinstance(data_dict, dict))
9
9
  fmt_d = {col_name: data_dict[col_name] for col_name in field_names}
10
10
 
11
11
  # Guarantee that items has correct type.
@@ -16,10 +16,14 @@ class DataService(object):
16
16
  return prompt.format(**fmt_d)
17
17
 
18
18
  @staticmethod
19
- def get_prompt_text(prompt, data_dict, parse_fields_func=iter_params):
19
+ def get_prompt_text(prompt, data_dict, parse_fields_func=iter_params, handle_missed_func=None):
20
20
  field_names = list(parse_fields_func(prompt))
21
- return DataService.compose_prompt_text(
22
- prompt=prompt, data_dict=data_dict, field_names=field_names)
21
+
22
+ for col_name in field_names:
23
+ if col_name not in data_dict:
24
+ data_dict[col_name] = handle_missed_func(col_name)
25
+
26
+ return DataService.__compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
23
27
 
24
28
  @staticmethod
25
29
  def limit_prompts(prompts_list, limit=None):
@@ -2,6 +2,7 @@ import importlib
2
2
  import logging
3
3
  import sys
4
4
  from collections import Counter
5
+ from os.path import dirname, join, basename
5
6
 
6
7
  logger = logging.getLogger(__name__)
7
8
  logging.basicConfig(level=logging.INFO)
@@ -47,28 +48,6 @@ def iter_params(text):
47
48
  beg = pe+1
48
49
 
49
50
 
50
- def format_model_name(name):
51
- return name.replace("/", "_")
52
-
53
-
54
- def parse_filepath(filepath, default_filepath=None, default_ext=None):
55
- """ This is an auxiliary function for handling sources and targets from cmd string.
56
- """
57
- if filepath is None:
58
- return default_filepath, default_ext, None
59
- info = filepath.split(":")
60
- filepath = info[0]
61
- meta = info[1] if len(info) > 1 else None
62
- ext = filepath.split('.')[-1] if default_ext is None else default_ext
63
- return filepath, ext, meta
64
-
65
-
66
- def handle_table_name(name):
67
- return name.\
68
- replace('-', '_').\
69
- replace('.', "_")
70
-
71
-
72
51
  def auto_import(name, is_class=False):
73
52
  """ Import from the external python packages.
74
53
  """
@@ -82,13 +61,24 @@ def auto_import(name, is_class=False):
82
61
 
83
62
 
84
63
  def dynamic_init(class_dir, class_filepath, class_name=None):
85
- sys.path.append(class_dir)
64
+
65
+ # Registering path.
66
+ target = join(class_dir, dirname(class_filepath))
67
+ logger.info(f"Adding sys path for `{target}`")
68
+ sys.path.insert(1, target)
86
69
  class_path_list = class_filepath.split('/')
87
- class_path_list[-1] = '.'.join(class_path_list[-1].split('.')[:-1])
70
+
71
+ # Composing proper class name.
72
+ class_filename = basename(class_path_list[-1])
73
+ if class_filename.endswith(".py"):
74
+ class_filename = class_filename[:-len(".py")]
75
+
76
+ # Loading library.
88
77
  class_name = class_path_list[-1].title() if class_name is None else class_name
89
- class_path = ".".join(class_path_list + [class_name])
78
+ class_path = ".".join([class_filename, class_name])
90
79
  logger.info(f"Dynamic loading for the file and class `{class_path}`")
91
80
  cls = auto_import(class_path, is_class=False)
81
+
92
82
  return cls
93
83
 
94
84
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.2
3
+ Version: 1.0.0
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -16,9 +16,8 @@ Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: tqdm
19
- Requires-Dist: source-iter==0.24.3
20
19
 
21
- # bulk-chain 0.25.2
20
+ # bulk-chain 1.0.0
22
21
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
23
22
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
24
23
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -31,7 +30,7 @@ Requires-Dist: source-iter==0.24.3
31
30
  <p align="center">
32
31
  <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
32
  <br>
34
- <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
33
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
35
34
  </p>
36
35
 
37
36
  A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
@@ -39,11 +38,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
39
38
  ### Main Features
40
39
  * ✅ **No-strings**: you're free from LLM dependencies, with flexible `venv` customization.
41
40
  * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
42
- * ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
43
-
44
- ### Extra Features
45
- * ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
46
-
41
+ * ✅ **Provides iterator over infinite amount of input contexts**
47
42
 
48
43
  # Installation
49
44
 
@@ -88,51 +83,8 @@ Preliminary steps:
88
83
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
89
84
  2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
90
85
 
91
- ## Shell
92
-
93
- ### Demo Mode
94
-
95
- **demo mode** to interact with LLM via command line with LLM output streaming support.
96
- The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
97
-
98
- Quck start with launching demo:
99
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
100
- 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
101
- 3. 🚀 Launch `demo.py` as follows:
102
- ```bash
103
- python3 -m bulk_chain.demo \
104
- --schema "test/schema/thor_cot_schema.json" \
105
- --adapter "dynamic:replicate_104.py:Replicate" \
106
- %%m \
107
- --model_name "meta/meta-llama-3-70b-instruct" \
108
- --api_token "<REPLICATE-API-TOKEN>" \
109
- --stream
110
- ```
111
-
112
- 📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
113
- ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
114
86
 
115
87
 
116
- ### Inference Mode
117
-
118
- > **NOTE:** You have to install `source-iter` and `tqdm` packages that actual [dependencies](dependencies.txt) of this project
119
-
120
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
121
- ```bash
122
- wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
123
- ```
124
- 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
125
- 3. 🚀 Launch inference using `DeepSeek-R1`:
126
- ```bash
127
- python3 -m bulk_chain.infer \
128
- --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
129
- --schema "test/schema/default.json" \
130
- --adapter "replicate_104.py:Replicate" \
131
- %%m \
132
- --model_name "deepseek-ai/deepseek-r1" \
133
- --api_token "<REPLICATE-API-TOKEN>"
134
- ```
135
-
136
88
  ## API
137
89
 
138
90
  Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
@@ -3,8 +3,6 @@ README.md
3
3
  setup.py
4
4
  bulk_chain/__init__.py
5
5
  bulk_chain/api.py
6
- bulk_chain/demo.py
7
- bulk_chain/infer.py
8
6
  bulk_chain.egg-info/PKG-INFO
9
7
  bulk_chain.egg-info/SOURCES.txt
10
8
  bulk_chain.egg-info/dependency_links.txt
@@ -12,17 +10,14 @@ bulk_chain.egg-info/requires.txt
12
10
  bulk_chain.egg-info/top_level.txt
13
11
  bulk_chain/core/__init__.py
14
12
  bulk_chain/core/llm_base.py
15
- bulk_chain/core/service_args.py
16
13
  bulk_chain/core/service_batch.py
17
14
  bulk_chain/core/service_data.py
18
15
  bulk_chain/core/service_dict.py
19
16
  bulk_chain/core/service_json.py
20
- bulk_chain/core/service_llm.py
21
17
  bulk_chain/core/service_schema.py
22
18
  bulk_chain/core/utils.py
23
- bulk_chain/core/utils_logger.py
24
19
  test/test.py
25
20
  test/test_api.py
21
+ test/test_api_streaming.py
26
22
  test/test_args_seeking.py
27
- test/test_cmdargs.py
28
23
  test/test_provider_batching.py
@@ -0,0 +1 @@
1
+ tqdm
@@ -15,7 +15,7 @@ def get_requirements(filenames):
15
15
 
16
16
  setup(
17
17
  name='bulk_chain',
18
- version='0.25.2',
18
+ version='1.0.0',
19
19
  python_requires=">=3.6",
20
20
  description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
21
21
  'ensuring reliable results for bulk input requests.',
@@ -3,37 +3,28 @@ from os.path import join
3
3
 
4
4
  from bulk_chain.api import iter_content, CWD
5
5
  from bulk_chain.core.utils import dynamic_init
6
- from bulk_chain.infer import iter_content_cached
6
+ from utils import current_dir, API_TOKEN
7
7
 
8
8
 
9
9
  class TestAPI(unittest.TestCase):
10
10
 
11
11
  llm = dynamic_init(class_dir=join(CWD, ".."),
12
12
  class_filepath="providers/replicate_104.py",
13
- class_name="Replicate")(api_token="<API-KEY>",
13
+ class_name="Replicate")(api_token=API_TOKEN,
14
14
  model_name="deepseek-ai/deepseek-r1")
15
15
 
16
- def it_data(self, n):
16
+ @staticmethod
17
+ def it_data(n):
17
18
  for i in range(n):
18
19
  yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
19
20
 
20
- def test_iter_cached(self):
21
- data_it = iter_content_cached(input_dicts_it=self.it_data(20),
22
- llm=self.llm,
23
- schema="../schema/default.json",
24
- # Cache-related extra parameters.
25
- cache_target="out.sqlite:content",
26
- id_column_name="ind")
27
-
28
- for data in data_it:
29
- print(data)
30
-
31
21
  def test_iter(self):
32
22
  data_it = iter_content(input_dicts_it=self.it_data(20),
33
23
  llm=self.llm,
34
24
  batch_size=1,
35
- return_batch=True,
36
- schema="../schema/default.json")
25
+ handle_missed_value_func=lambda *_: None,
26
+ return_mode="batch",
27
+ schema=join(current_dir, "schema/default.json"))
37
28
 
38
29
  for data in data_it:
39
30
  print(data)