bulk-chain 1.0.0-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bulk_chain/api.py CHANGED
@@ -1,41 +1,36 @@
1
+ import asyncio
1
2
  import collections
3
+ import logging
2
4
  import os
3
5
  from itertools import chain
6
+ from types import AsyncGeneratorType
4
7
 
5
8
  from bulk_chain.core.llm_base import BaseLM
9
+ from bulk_chain.core.service_asyncio import AsyncioService
6
10
  from bulk_chain.core.service_batch import BatchIterator
7
11
  from bulk_chain.core.service_data import DataService
8
12
  from bulk_chain.core.service_dict import DictionaryService
9
13
  from bulk_chain.core.service_json import JsonService
10
14
  from bulk_chain.core.service_schema import SchemaService
11
- from bulk_chain.core.utils import dynamic_init, find_by_prefix
15
+ from bulk_chain.core.utils import attempt_wrapper
12
16
 
13
17
 
14
18
  INFER_MODES = {
15
- "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
16
- DataService.limit_prompts(batch, limit=limit_prompt))
19
+ "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
20
+ "batch": lambda llm, batch, **kwargs: llm.ask_batch(batch),
21
+ "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
22
+ "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
23
+ batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
24
+ ),
25
+ "batch_stream_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
26
+ batch=batch, async_handler=llm.ask_stream_async, event_loop=kwargs.get("event_loop")
27
+ ),
17
28
  }
18
29
 
19
30
 
20
31
  CWD = os.getcwd()
21
32
 
22
33
 
23
- def _iter_entry_content(entry, entry_info=None, **kwargs):
24
-
25
- if isinstance(entry, str):
26
- kwargs.get("callback_str_func", lambda *_: None)(entry, entry_info)
27
- yield entry
28
- elif isinstance(entry, collections.abc.Iterable):
29
- h = kwargs.get("callback_stream_func", lambda *_: None)
30
- h(None, entry_info | {"action": "start"})
31
- for chunk in map(lambda item: str(item), entry):
32
- yield chunk
33
- h(chunk, entry_info)
34
- h(None, entry_info | {"action": "end"})
35
- else:
36
- raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
37
-
38
-
39
34
  def _iter_batch_prompts(c, batch_content_it, **kwargs):
40
35
  for ind_in_batch, entry in enumerate(batch_content_it):
41
36
  content = DataService.get_prompt_text(
@@ -45,19 +40,63 @@ def _iter_batch_prompts(c, batch_content_it, **kwargs):
45
40
  yield ind_in_batch, content
46
41
 
47
42
 
48
- def _iter_batch_responses(p_column, c, batch_content_it, **kwargs):
43
+ def __handle_agen_to_gen(handle, batch, event_loop):
44
+ """ This handler provides conversion of the async generator to generator (sync).
45
+ """
46
+
47
+ def __wrap_with_index(async_gens):
48
+ async def wrapper(index, agen):
49
+ async for item in agen:
50
+ yield index, item
51
+ return [wrapper(i, agen) for i, agen in enumerate(async_gens)]
52
+
53
+ agen_list = handle(batch, event_loop=event_loop)
54
+
55
+ it = AsyncioService.async_gen_to_iter(
56
+ gen=AsyncioService.merge_generators(*__wrap_with_index(agen_list)),
57
+ loop=event_loop)
58
+
59
+ for ind_in_batch, chunk in it:
60
+ yield ind_in_batch, str(chunk)
61
+
62
+
63
+ def __handle_gen(handle, batch, event_loop):
64
+ """ This handler deals with the iteration of each individual element of the batch.
65
+ """
66
+
67
+ def _iter_entry_content(entry):
68
+ if isinstance(entry, str):
69
+ yield entry
70
+ elif isinstance(entry, collections.abc.Iterable):
71
+ for chunk in map(lambda item: str(item), entry):
72
+ yield chunk
73
+ elif isinstance(entry, AsyncGeneratorType):
74
+ for chunk in AsyncioService.async_gen_to_iter(entry, loop=event_loop):
75
+ yield str(chunk)
76
+ else:
77
+ raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
78
+
79
+ for ind_in_batch, entry in enumerate(handle(batch, event_loop=event_loop)):
80
+ for chunk in _iter_entry_content(entry=entry):
81
+ yield ind_in_batch, chunk
82
+
83
+
84
+ def _iter_chunks(p_column, batch_content_it, **kwargs):
85
+ handler = __handle_agen_to_gen if kwargs["infer_mode"] == "batch_stream_async" else __handle_gen
49
86
  p_batch = [item[p_column] for item in batch_content_it]
50
- # TODO. This part could be async.
51
- # TODO. ind_in_batch might be a part of the async return.
52
- for ind_in_batch, entry in enumerate(kwargs["handle_batch_func"](p_batch)):
53
- yield ind_in_batch, _iter_entry_content(entry=entry, entry_info={"ind": ind_in_batch, "param": c}, **kwargs)
87
+ it = handler(handle=kwargs["handle_batch_func"], batch=p_batch, event_loop=kwargs["event_loop"])
88
+ for ind_in_batch, chunk in it:
89
+ yield ind_in_batch, chunk
54
90
 
55
91
 
56
- def _infer_batch(batch, schema, return_mode, cols=None, **kwargs):
92
+ def _column_ordered_chunks_iter(batch, schema, cols=None, keep_prompts=True, **kwargs):
93
+ """
94
+ NOTE: we populate `batch` content automatically
95
+ """
57
96
  assert (isinstance(batch, list))
58
97
 
59
98
  if len(batch) == 0:
60
- return batch
99
+ return
61
100
 
62
101
  if cols is None:
63
102
  first_item = batch[0]
@@ -73,71 +112,118 @@ def _infer_batch(batch, schema, return_mode, cols=None, **kwargs):
73
112
 
74
113
  # Handling column for inference.
75
114
  if c in schema.r2p:
76
- content_it = _iter_batch_responses(c=c, p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
77
- for ind_in_batch, chunk_it in content_it:
78
-
79
- chunks = []
80
- for chunk in chunk_it:
81
- chunks.append(chunk)
82
-
83
- if return_mode == "chunk":
84
- yield [ind_in_batch, c, chunk]
85
-
86
- batch[ind_in_batch][c] = "".join(chunks)
87
-
88
- if return_mode == "record":
115
+ content_it = _iter_chunks(p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
116
+ # Register values.
117
+ for item in batch:
118
+ item[c] = []
119
+ for ind_in_batch, chunk in content_it:
120
+ # Append batch.
121
+ batch[ind_in_batch][c].append(chunk)
122
+ yield [ind_in_batch, c, chunk]
123
+
124
+ # Convert content to string.
125
+ for item in batch:
126
+ item[c] = "".join(item[c])
127
+
128
+ if not keep_prompts:
129
+ for batch_item in batch:
130
+ for key in list(batch_item.keys()):
131
+ prompt_col = SchemaService.col_to_prompt(col_name=key, prompt_data=batch_item)
132
+ if prompt_col in batch_item:
133
+ del batch_item[prompt_col]
134
+
135
+
136
+ def _infer_batch(return_type, batch, batch_ind, **kwargs):
137
+ assert (return_type in ["batch", "chunk", "record"])
138
+
139
+ # Filling batch with inference content.
140
+ for ind_in_batch, column, chunk in _column_ordered_chunks_iter(batch=batch, **kwargs):
141
+ if return_type == "chunk":
142
+ global_ind = batch_ind * len(batch) + ind_in_batch
143
+ yield [global_ind, column, chunk]
144
+
145
+ if return_type == "record":
89
146
  for record in batch:
90
147
  yield record
91
148
 
92
- if return_mode == "batch":
149
+ if return_type == "batch":
93
150
  yield batch
94
151
 
95
152
 
96
- def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None, return_mode="batch", **kwargs):
153
+ def get_infer_mode(stream, batch_size, async_mode):
154
+ if not stream and batch_size == 1:
155
+ return 'single', 'record'
156
+ elif not stream and batch_size > 1:
157
+ if async_mode:
158
+ return 'batch_async', 'batch'
159
+ else:
160
+ return 'batch', 'batch'
161
+ elif stream and batch_size == 1:
162
+ return 'single_stream', 'chunk'
163
+ elif stream and batch_size > 1:
164
+ return 'batch_stream_async', 'chunk'
165
+
166
+ raise ValueError(f"Invalid combination of stream and batch_size: {stream}, {batch_size}")
167
+
168
+
169
+ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
170
+ stream=False, async_mode=False, attempts=1, event_loop=None,
171
+ handle_missed_value_func=lambda *_: None, **kwargs):
97
172
  """ This method represent Python API aimed at application of `llm` towards
98
173
  iterator of input_dicts, inferring the content of each record according to
99
174
  the given `schema`.
100
175
  """
101
- assert (return_mode in ["batch", "chunk"])
102
176
  assert (isinstance(llm, BaseLM))
177
+ assert (isinstance(batch_size, int) and batch_size > 0)
178
+ assert (isinstance(async_mode, bool))
179
+
180
+ infer_type, return_type = get_infer_mode(stream=stream, batch_size=batch_size, async_mode=async_mode)
181
+ infer_mode = INFER_MODES[infer_type]
182
+
183
+ # Setup event loop.
184
+ event_loop = asyncio.get_event_loop_policy().get_event_loop() \
185
+ if event_loop is None else event_loop
103
186
 
104
187
  # Quick initialization of the schema.
105
188
  if isinstance(schema, str):
106
189
  schema = JsonService.read(schema)
107
190
  if isinstance(schema, dict):
108
191
  schema = SchemaService(json_data=schema)
192
+ if isinstance(schema, list):
193
+ schema = SchemaService(json_data={"schema": schema})
109
194
 
110
195
  prompts_it = map(
111
196
  lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
112
197
  input_dicts_it
113
198
  )
114
199
 
115
- content_it = (_infer_batch(batch=batch,
116
- handle_batch_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
117
- return_mode=return_mode,
200
+ handle_batch_func = lambda batch, **handle_kwargs: infer_mode(
201
+ llm,
202
+ DataService.limit_prompts(batch, limit=limit_prompt),
203
+ **handle_kwargs
204
+ )
205
+
206
+ # Optional wrapping into attempts.
207
+ if attempts > 1:
208
+ # Optional setup of the logger.
209
+ logger = logging.getLogger(__name__)
210
+ logging.basicConfig(level=logging.INFO)
211
+
212
+ attempt_dec = attempt_wrapper(attempts=attempts,
213
+ delay_sec=kwargs.get("attempt_delay_sec", 1),
214
+ logger=logger)
215
+ handle_batch_func = attempt_dec(handle_batch_func)
216
+
217
+ kwargs["handle_missed_value_func"] = handle_missed_value_func
218
+
219
+ content_it = (_infer_batch(return_type=return_type,
220
+ batch=batch,
221
+ batch_ind=batch_ind,
222
+ infer_mode=infer_mode,
223
+ handle_batch_func=handle_batch_func,
118
224
  schema=schema,
225
+ event_loop=event_loop,
119
226
  **kwargs)
120
- for batch in BatchIterator(prompts_it, batch_size=batch_size))
227
+ for batch_ind, batch in enumerate(BatchIterator(prompts_it, batch_size=batch_size)))
121
228
 
122
229
  yield from chain.from_iterable(content_it)
123
-
124
-
125
- def init_llm(adapter, **model_kwargs):
126
- """ This method perform dynamic initialization of LLM from third-party resource.
127
- """
128
- assert (isinstance(adapter, str))
129
-
130
- # List of the Supported models and their API wrappers.
131
- models_preset = {
132
- "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
133
- class_name=llm_model_params)(**model_kwargs)
134
- }
135
-
136
- # Initialize LLM model.
137
- params = adapter.split(':')
138
- llm_model_type = params[0]
139
- llm_model_name = params[1] if len(params) > 1 else params[-1]
140
- llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
141
- llm = find_by_prefix(d=models_preset, key=llm_model_type)()
142
-
143
- return llm, llm_model_name
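The `stream`, `batch_size` and `async_mode` flags introduced above jointly select the inference mode and the return granularity. A quick sanity-check sketch of that mapping, mirroring `get_infer_mode` from the hunk above:

```python
from bulk_chain.api import get_infer_mode

# (infer mode, return granularity) selected by the public flags.
assert get_infer_mode(stream=False, batch_size=1, async_mode=False) == ("single", "record")
assert get_infer_mode(stream=False, batch_size=4, async_mode=False) == ("batch", "batch")
assert get_infer_mode(stream=False, batch_size=4, async_mode=True) == ("batch_async", "batch")
assert get_infer_mode(stream=True, batch_size=1, async_mode=False) == ("single_stream", "chunk")
assert get_infer_mode(stream=True, batch_size=4, async_mode=True) == ("batch_stream_async", "chunk")
```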
bulk_chain/core/llm_base.py CHANGED
@@ -1,50 +1,29 @@
1
- import logging
2
- import time
3
-
4
-
5
1
  class BaseLM(object):
6
2
 
7
- def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
8
- support_batching=False, **kwargs):
9
-
10
- self.__name = name
11
- self.__attempts = 1 if attempts is None else attempts
12
- self.__delay_sec = delay_sec
13
- self.__support_batching = support_batching
14
-
15
- if enable_log:
16
- self.__logger = logging.getLogger(__name__)
17
- logging.basicConfig(level=logging.INFO)
18
-
19
- def ask_core(self, batch):
20
-
21
- for i in range(self.__attempts):
22
- try:
23
- if self.__support_batching:
24
- # Launch in batch mode.
25
- content = batch
26
- else:
27
- # Launch in non-batch mode.
28
- assert len(batch) == 1, "The LM does not support batching," \
29
- f" while size of the content is {len(batch)} which is not equal 1. " \
30
- f"Please enable batch-supporting or set required inference settings."
31
- content = batch[0]
3
+ def __init__(self, **kwargs):
4
+ pass
32
5
 
33
- response = self.ask(content)
34
-
35
- # Wrapping into batch the response in the case of non-batching mode.
36
- return response if self.__support_batching else [response]
6
+ def ask(self, prompt):
7
+ """ Assumes to return str.
8
+ """
9
+ raise NotImplementedError()
37
10
 
38
- except Exception as e:
39
- if self.__logger is not None:
40
- self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i, self.__attempts))
41
- self.__logger.info(e)
42
- time.sleep(self.__delay_sec)
11
+ def ask_batch(self, batch):
12
+ """ Assumes to return generator.
13
+ """
14
+ raise NotImplementedError()
43
15
 
44
- raise Exception("Can't infer")
16
+ def ask_stream(self, prompt):
17
+ """ Assumes to return generator.
18
+ """
19
+ raise NotImplementedError()
45
20
 
46
- def ask(self, content):
21
+ async def ask_async(self, prompt):
22
+ """ Assumes to return co-routine.
23
+ """
47
24
  raise NotImplementedError()
48
25
 
49
- def name(self):
50
- return self.__name.replace("/", "_")
26
+ async def ask_stream_async(self, prompt):
27
+ """ Assumes to return AsyncGenerator.
28
+ """
29
+ raise NotImplementedError()
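The reworked `BaseLM` above is now a thin interface: a provider implements whichever of the five methods the inference modes it uses require. A minimal sketch of a provider; the `EchoLM` name and behaviour are purely illustrative, not part of the package:

```python
from bulk_chain.core.llm_base import BaseLM


class EchoLM(BaseLM):
    """Hypothetical provider that just echoes prompts back, for illustration only."""

    def ask(self, prompt):
        return f"echo: {prompt}"

    def ask_batch(self, batch):
        return [self.ask(prompt) for prompt in batch]

    def ask_stream(self, prompt):
        # Emulate streaming by yielding the answer word by word.
        for token in self.ask(prompt).split():
            yield token + " "

    async def ask_async(self, prompt):
        return self.ask(prompt)

    async def ask_stream_async(self, prompt):
        # Async counterpart of ask_stream.
        for token in self.ask(prompt).split():
            yield token + " "
```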
bulk_chain/core/service_asyncio.py ADDED
@@ -0,0 +1,65 @@
1
+ import asyncio
2
+ from typing import AsyncGenerator, Any
3
+
4
+
5
+ class AsyncioService:
6
+
7
+ @staticmethod
8
+ async def _run_tasks_async(batch, async_handler):
9
+ tasks = [async_handler(prompt) for prompt in batch]
10
+ return await asyncio.gather(*tasks)
11
+
12
+ @staticmethod
13
+ async def _run_generator(gen, output_queue, idx):
14
+ try:
15
+ async for item in gen:
16
+ await output_queue.put((idx, item))
17
+ finally:
18
+ await output_queue.put((idx, StopAsyncIteration))
19
+
20
+
21
+ @staticmethod
22
+ def run_tasks(event_loop, **tasks_kwargs):
23
+ return event_loop.run_until_complete(AsyncioService._run_tasks_async(**tasks_kwargs))
24
+
25
+ @staticmethod
26
+ async def merge_generators(*gens: AsyncGenerator[Any, None]) -> AsyncGenerator[Any, None]:
27
+
28
+ output_queue = asyncio.Queue()
29
+ tasks = [
30
+ asyncio.create_task(AsyncioService._run_generator(gen, output_queue, idx))
31
+ for idx, gen in enumerate(gens)
32
+ ]
33
+
34
+ finished = set()
35
+ while len(finished) < len(tasks):
36
+ idx, item = await output_queue.get()
37
+ if item is StopAsyncIteration:
38
+ finished.add(idx)
39
+ else:
40
+ yield item
41
+
42
+ for task in tasks:
43
+ task.cancel()
44
+
45
+ @staticmethod
46
+ def async_gen_to_iter(gen, loop=None):
47
+ """ This approach is limited. Could be considered as legacy.
48
+ https://stackoverflow.com/questions/71580727/translating-async-generator-into-sync-one/78573267#78573267
49
+ """
50
+
51
+ loop_created = False
52
+ if loop is None:
53
+ loop_created = True
54
+ loop = asyncio.new_event_loop()
55
+
56
+ asyncio.set_event_loop(loop)
57
+ try:
58
+ while True:
59
+ try:
60
+ yield loop.run_until_complete(gen.__anext__())
61
+ except StopAsyncIteration:
62
+ break
63
+ finally:
64
+ if loop_created:
65
+ loop.close()
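A minimal sketch of how the new `AsyncioService` helpers compose: several async generators are merged into one stream and consumed from synchronous code. The `count` generator here is illustrative only:

```python
import asyncio

from bulk_chain.core.service_asyncio import AsyncioService


async def count(tag, n):
    # Illustrative async generator producing n tagged items.
    for i in range(n):
        await asyncio.sleep(0)
        yield f"{tag}-{i}"


loop = asyncio.new_event_loop()
merged = AsyncioService.merge_generators(count("a", 2), count("b", 2))
items = list(AsyncioService.async_gen_to_iter(merged, loop=loop))
loop.close()

print(sorted(items))  # ['a-0', 'a-1', 'b-0', 'b-1']; interleaving order is not guaranteed
```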
bulk_chain/core/service_batch.py CHANGED
@@ -1,8 +1,8 @@
1
1
  class BatchIterator:
2
2
 
3
3
  def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
4
- assert(isinstance(batch_size, int) and batch_size > 0)
5
- assert(callable(end_value) or end_value is None)
4
+ assert (isinstance(batch_size, int) and batch_size > 0)
5
+ assert (callable(end_value) or end_value is None)
6
6
  self.__data_iter = data_iter
7
7
  self.__index = 0
8
8
  self.__batch_size = batch_size
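For reference, `BatchIterator` (only its assert spacing changes here) is what `iter_content` uses to slice the input stream into batches. A small sketch, assuming it yields lists of up to `batch_size` items as `api.py` relies on:

```python
from bulk_chain.core.service_batch import BatchIterator

data_it = ({"text": f"doc-{i}"} for i in range(5))
for batch in BatchIterator(data_it, batch_size=2):
    # Each batch is expected to be a list of input dictionaries.
    print(len(batch), batch)
```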
bulk_chain/core/service_schema.py CHANGED
@@ -9,6 +9,10 @@ class SchemaService(object):
9
9
  prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
10
10
  return cls(prompt_schema)
11
11
 
12
+ @staticmethod
13
+ def col_to_prompt(col_name, prompt_data):
14
+ return col_name + "_prompt" if "in" not in prompt_data else prompt_data["in"]
15
+
12
16
  @staticmethod
13
17
  def __init_schema(prompts):
14
18
 
@@ -19,7 +23,7 @@ class SchemaService(object):
19
23
 
20
24
  for prompt in prompts:
21
25
  r_col_name = prompt["out"]
22
- p_col_name = r_col_name + "_prompt" if "in" not in prompt else prompt["in"]
26
+ p_col_name = SchemaService.col_to_prompt(col_name=r_col_name, prompt_data=prompt)
23
27
 
24
28
  assert r_col_name not in schema_r2p, f"`{r_col_name}` has been already declared!"
25
29
  assert p_col_name not in schema_p2r, f"`{p_col_name}` has been already declared!"
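The new `SchemaService.col_to_prompt` helper centralises how a prompt column name is derived from a schema entry (used by `__init_schema` above and by `keep_prompts=False` in `api.py`). For instance:

```python
from bulk_chain.core.service_schema import SchemaService

# By default the prompt column is "<out>_prompt" ...
assert SchemaService.col_to_prompt(col_name="topic", prompt_data={"prompt": "...", "out": "topic"}) == "topic_prompt"
# ... unless the schema entry pins it explicitly via "in".
assert SchemaService.col_to_prompt(col_name="topic", prompt_data={"out": "topic", "in": "query"}) == "query"
```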
bulk_chain/core/utils.py CHANGED
@@ -1,6 +1,8 @@
1
+ import ast
1
2
  import importlib
2
3
  import logging
3
4
  import sys
5
+ import time
4
6
  from collections import Counter
5
7
  from os.path import dirname, join, basename
6
8
 
@@ -34,18 +36,30 @@ def find_by_prefix(d, key):
34
36
  return d[matches[0]]
35
37
 
36
38
 
39
+ def check_is_param_name(param_name):
40
+ return param_name.replace("_", "").isalpha()
41
+
42
+
37
43
  def iter_params(text):
38
44
  assert(isinstance(text, str))
39
45
  beg = 0
40
46
  while beg < len(text):
41
48
  try:
42
49
  pb = text.index('{', beg)
43
50
  except ValueError:
44
51
  break
45
- pe = text.index('}', beg+1)
46
- # Yield argument.
47
- yield text[pb+1:pe]
48
- beg = pe+1
52
+ pe = text.index('}', pb+1)
53
+ param_name = text[pb + 1:pe]
54
+
55
+ # Check parameter validity.
56
+ if not check_is_param_name(param_name):
57
+ beg = pb + 1
58
+ continue
59
+
60
+ # Passing.
61
+ yield param_name
62
+ beg = pe + 1
49
63
 
50
64
 
51
65
  def auto_import(name, is_class=False):
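The reworked `iter_params` now validates every brace-delimited span via `check_is_param_name`, so stray `{...}` content that is not a plain parameter name is skipped rather than yielded. For example:

```python
from bulk_chain.core.utils import iter_params

assert list(iter_params("extract topic: {text}")) == ["text"]
# Non-identifier spans in braces (e.g. inline JSON) are skipped.
assert list(iter_params('describe {subject} given {"k": 1}')) == ["subject"]
```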
@@ -60,10 +74,21 @@ def auto_import(name, is_class=False):
60
74
  return m() if is_class else m
61
75
 
62
76
 
63
- def dynamic_init(class_dir, class_filepath, class_name=None):
77
+ def get_class_name(file_path):
78
+ with open(file_path, 'r') as f:
79
+ tree = ast.parse(f.read(), filename=file_path)
80
+
81
+ for node in ast.walk(tree):
82
+ if isinstance(node, ast.ClassDef):
83
+ return node.name
84
+
85
+ return None
86
+
87
+
88
+ def dynamic_init(class_filepath, class_name=None):
64
89
 
65
90
  # Registering path.
66
- target = join(class_dir, dirname(class_filepath))
91
+ target = join(dirname(class_filepath))
67
92
  logger.info(f"Adding sys path for `{target}`")
68
93
  sys.path.insert(1, target)
69
94
  class_path_list = class_filepath.split('/')
@@ -74,7 +99,7 @@ def dynamic_init(class_dir, class_filepath, class_name=None):
74
99
  class_filename = class_filename[:-len(".py")]
75
100
 
76
101
  # Loading library.
77
- class_name = class_path_list[-1].title() if class_name is None else class_name
102
+ class_name = get_class_name(class_filepath) if class_name is None else class_name
78
103
  class_path = ".".join([class_filename, class_name])
79
104
  logger.info(f"Dynamic loading for the file and class `{class_path}`")
80
105
  cls = auto_import(class_path, is_class=False)
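`dynamic_init` no longer needs a `class_dir` or an explicit class name: `get_class_name` parses the provider file and returns the first class definition it finds. A small illustration using a temporary file; the `CustomProvider` name is hypothetical:

```python
import tempfile
import textwrap

from bulk_chain.core.utils import get_class_name

with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
    f.write(textwrap.dedent("""
        class CustomProvider:
            def ask(self, prompt):
                return prompt
    """))
    provider_path = f.name

# The first class defined in the file is resolved automatically.
assert get_class_name(provider_path) == "CustomProvider"
```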
@@ -89,3 +114,21 @@ def optional_limit_iter(it_data, limit=None):
89
114
  if limit is not None and counter["returned"] > limit:
90
115
  break
91
116
  yield data
117
+
118
+
119
+ def attempt_wrapper(attempts, delay_sec=1, logger=None):
120
+ def decorator(func):
121
+ def wrapper(*args, **kwargs):
122
+ for i in range(attempts):
123
+ try:
124
+ # Do action.
125
+ return func(*args, **kwargs)
126
+ except Exception as e:
127
+ if logger is not None:
128
+ logger.info(f"Unable to infer the result. Try {i} out of {attempts}.")
129
+ logger.info(e)
130
+ if delay_sec is not None:
131
+ time.sleep(delay_sec)
132
+ raise Exception(f"Failed after {attempts} attempts")
133
+ return wrapper
134
+ return decorator
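`attempt_wrapper` replaces the retry loop that previously lived inside `BaseLM.ask_core`. A brief usage sketch with a hypothetical flaky function:

```python
import logging

from bulk_chain.core.utils import attempt_wrapper

logging.basicConfig(level=logging.INFO)
calls = {"n": 0}


@attempt_wrapper(attempts=3, delay_sec=0.1, logger=logging.getLogger(__name__))
def flaky():
    # Hypothetical function that fails twice before succeeding.
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"


assert flaky() == "ok"  # two logged retries, then success
```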
bulk_chain-1.2.1.dist-info/METADATA ADDED
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.1
2
+ Name: bulk-chain
3
+ Version: 1.2.1
4
+ Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
+ Home-page: https://github.com/nicolay-r/bulk-chain
6
+ Author: Nicolay Rusnachenko
7
+ Author-email: rusnicolay@gmail.com
8
+ License: MIT License
9
+ Keywords: natural language processing,chain-of-thought,reasoning
10
+ Platform: UNKNOWN
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
14
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
+ Classifier: Topic :: Text Processing :: Linguistic
16
+ Requires-Python: >=3.6
17
+ Description-Content-Type: text/markdown
18
+
19
+ # bulk-chain 1.2.1
20
+ ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
+ [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
+ [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
23
+ [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
24
+
25
+ <p align="center">
26
+ <img src="logo.png"/>
27
+ </p>
28
+
29
+ <p align="center">
30
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
31
+ <br>
32
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
33
+ </p>
34
+
35
+ A no-strings-attached **framework** for your LLM that lets you apply a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
36
+
37
+ ### Main Features
38
+ * ✅ **No-strings**: free from LLM dependencies, with flexible `venv` customization.
39
+ * ✅ **Supports schema descriptions** for the Chain-of-Thought concept.
40
+ * ✅ **Provides an iterator over an unlimited number of input contexts**
41
+
42
+ # Installation
43
+
44
+ From PyPI:
45
+
46
+ ```bash
47
+ pip install --no-deps bulk-chain
48
+ ```
49
+
50
+ or latest version from here:
51
+
52
+ ```bash
53
+ pip install git+https://github.com/nicolay-r/bulk-chain@master
54
+ ```
55
+
56
+ ## Chain-of-Thought Schema
57
+
58
+ To declare a Chain-of-Thought (CoT) schema we use the `JSON` format.
59
+ The field `schema` is a list of CoT instructions for the Large Language Model.
60
+ Each item of the list represents a dictionary with `prompt` and `out` keys that correspond to the input prompt and the output variable name respectively.
61
+ All variable names should be wrapped in `{}`.
62
+
63
+ **Example**:
64
+ ```python
65
+ [
66
+ {"prompt": "extract topic: {text}", "out": "topic"},
67
+ {"prompt": "extract subject: {text}", "out": "subject"},
68
+ ]
69
+ ```
70
+
71
+ # Usage
72
+
73
+ ## 🤖 Prepare
74
+
75
+ 1. [schema](#chain-of-thought-schema)
76
+ * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
77
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
78
+ 3. Data (iter of dictionaries)
79
+
80
+ ## 🚀 Launch
81
+
82
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
83
+
84
+ ```python
85
+ from bulk_chain.core.utils import dynamic_init
86
+ from bulk_chain.api import iter_content
87
+
88
+ content_it = iter_content(
89
+ # 1. Your schema.
90
+ schema=[
91
+ {"prompt": "extract topic: {text}", "out": "topic" },
92
+ {"prompt": "extract subject: {text}", "out": "subject"},
93
+ ],
94
+ # 2. Your third-party model implementation.
95
+ llm=dynamic_init(class_filepath="replicate_104.py")(
96
+ api_token="<API-KEY>",
97
+ model_name="meta/meta-llama-3-70b-instruct"),
98
+ # 3. Toggle streaming if needed
99
+ stream=False,
100
+ # 4. Toggle Async API mode usage.
101
+ async_mode=True,
102
+ # 5. Batch size.
103
+ batch_size=10,
104
+ # 6. Your iterator of dictionaries
105
+ input_dicts_it=[
106
+ # Example of data ...
107
+ { "text": "Rocks are hard" },
108
+ { "text": "Water is wet" },
109
+ { "text": "Fire is hot" }
110
+ ],
111
+ )
112
+
113
+ for batch in content_it:
114
+ for entry in batch:
115
+ print(entry)
116
+ ```
117
+
118
+ Output entries represent the input texts augmented with `topic` and `subject`:
119
+ ```jsonl
120
+ {'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
121
+ {'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
122
+ {'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
123
+ ```
124
+
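With `stream=True` the same iterator yields chunk triplets (`[index, column, chunk]`, see `_infer_batch` in `api.py`) instead of completed records. A sketch of consuming that mode, assuming the same provider setup as in the example above:

```python
from bulk_chain.api import iter_content
from bulk_chain.core.utils import dynamic_init

content_it = iter_content(
    schema=[{"prompt": "extract topic: {text}", "out": "topic"}],
    llm=dynamic_init(class_filepath="replicate_104.py")(
        api_token="<API-KEY>",
        model_name="meta/meta-llama-3-70b-instruct"),
    stream=True,
    batch_size=1,
    input_dicts_it=[{"text": "Rocks are hard"}],
)

for ind, col, chunk in content_it:
    # Each item is [global index, output column name, text chunk].
    print(f"[{ind}] {col}: {chunk}", flush=True)
```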
125
+ # API
126
+
127
+ | Method | Mode | Description |
128
+ |----------------------|------------|---------------------------------------------------------------------|
129
+ | `ask(prompt)` | Sync | Infers the model with a single prompt. |
130
+ | `ask_stream(prompt)` | Sync | Returns a generator that yields chunks of the inferred result. |
131
+ | `ask_async(prompt)` | Async | Asynchronously infers the model with a single prompt. |
132
+ | `ask_stream_async(prompt)` | Async | Returns an async generator that yields chunks of the inferred result. |
133
+
134
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
135
+
136
+
bulk_chain-1.2.1.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
1
+ bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ bulk_chain/api.py,sha256=bLZXdp58i6LDayZQxRBxsFK4lVT8cZZn1uOY0iaZ5TE,8500
3
+ bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ bulk_chain/core/llm_base.py,sha256=H2KmCqChKp9sKOkROE-4zjMRCxizT9xWvNZSF22HeFU,673
5
+ bulk_chain/core/service_asyncio.py,sha256=S-D4K3LBa3noKTm0tXazluYVI8cBgN1IB6v6MFoMyNQ,1972
6
+ bulk_chain/core/service_batch.py,sha256=lWmjO0aU6h2rmfx_kGmNqt0Rdeaf2a4Dn5VyfKFkfDs,1033
7
+ bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
8
+ bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
9
+ bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
10
+ bulk_chain/core/service_schema.py,sha256=YAsdm3N2G4-eTpeJazg4Y-KQ2w9bEPpqreVl8a-M7H0,1311
11
+ bulk_chain/core/utils.py,sha256=hml0zLmnZe865gvc1CagEzRE19Gdh1pF8kx_KueDY3A,3667
12
+ bulk_chain-1.2.1.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
13
+ bulk_chain-1.2.1.dist-info/METADATA,sha256=xx1vcG6wkHzh_Ga3iZJV3MBdR97RBGpCf7JO5_lonN0,5339
14
+ bulk_chain-1.2.1.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
15
+ bulk_chain-1.2.1.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
16
+ bulk_chain-1.2.1.dist-info/RECORD,,
bulk_chain-1.2.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.3)
2
+ Generator: bdist_wheel (0.34.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
bulk_chain-1.0.0.dist-info/METADATA REMOVED
@@ -1,99 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: bulk_chain
3
- Version: 1.0.0
4
- Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
- Home-page: https://github.com/nicolay-r/bulk-chain
6
- Author: Nicolay Rusnachenko
7
- Author-email: rusnicolay@gmail.com
8
- License: MIT License
9
- Keywords: natural language processing,chain-of-thought,reasoning
10
- Classifier: Programming Language :: Python
11
- Classifier: Programming Language :: Python :: 3.9
12
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
14
- Classifier: Topic :: Text Processing :: Linguistic
15
- Requires-Python: >=3.6
16
- Description-Content-Type: text/markdown
17
- License-File: LICENSE
18
- Requires-Dist: tqdm
19
-
20
- # bulk-chain 1.0.0
21
- ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
22
- [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
23
- [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
24
- [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
25
-
26
- <p align="center">
27
- <img src="logo.png"/>
28
- </p>
29
-
30
- <p align="center">
31
- <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
32
- <br>
33
- <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
34
- </p>
35
-
36
- A no-strings-attached **framework** for your LLM that allows applying Chain-of-Thought-alike [prompt `schema`](#chain-of-thought-schema) towards a massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
37
-
38
- ### Main Features
39
- * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
40
- * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
41
- * ✅ **Provides iterator over infinite amount of input contexts**
42
-
43
- # Installation
44
-
45
- From PyPI:
46
-
47
- ```bash
48
- pip install --no-deps bulk-chain
49
- ```
50
-
51
- or latest version from here:
52
-
53
- ```bash
54
- pip install git+https://github.com/nicolay-r/bulk-chain@master
55
- ```
56
-
57
- ## Chain-of-Thought Schema
58
-
59
- To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
60
- This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
61
-
62
- Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
63
- All the variable names are expected to be mentioned in `{}`.
64
-
65
- Below, is an example on how to declare your own schema:
66
-
67
- ```python
68
- {
69
- "name": "schema-name",
70
- "schema": [
71
- {"prompt": "Given the question '{text}', let's think step-by-step.",
72
- "out": "steps"},
73
- {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
74
- "out": "answer"},
75
- ]
76
- }
77
- ```
78
-
79
- # Usage
80
-
81
- Preliminary steps:
82
-
83
- 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
84
- 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
85
-
86
-
87
-
88
- ## API
89
-
90
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
91
-
92
-
93
- # Embed your LLM
94
-
95
- All you have to do is to implement `BaseLM` class, that includes:
96
- * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
97
- * `ask(prompt)` -- infer your model with the given `prompt`.
98
-
99
- See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
bulk_chain-1.0.0.dist-info/RECORD REMOVED
@@ -1,15 +0,0 @@
1
- bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- bulk_chain/api.py,sha256=d_c10Je8wUSnCdQjyWCHVx4FGW6M2_pBMMqKsI_YJaY,5119
3
- bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- bulk_chain/core/llm_base.py,sha256=DZ9l4HpCs9uKTZp68miw_XCqmRAJBqQPuYSK889CeUk,1785
5
- bulk_chain/core/service_batch.py,sha256=LMxrZeQXV_AJAoCaMCHVx8TvjcmCaKUQhNE8K4D8pCo,1031
6
- bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
7
- bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
8
- bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
9
- bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
10
- bulk_chain/core/utils.py,sha256=Dx9Gy-jPpk-w_8WUekN0Ij4RBIWVAPg74vA3N0JgGqc,2471
11
- bulk_chain-1.0.0.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
12
- bulk_chain-1.0.0.dist-info/METADATA,sha256=TR86CmhcHJ3Sep8TlHZ0Ede_PnH8G5iMILUvVvSskJY,3810
13
- bulk_chain-1.0.0.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
14
- bulk_chain-1.0.0.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
15
- bulk_chain-1.0.0.dist-info/RECORD,,