bulk-chain 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/PKG-INFO +28 -9
  2. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/README.md +27 -7
  3. bulk_chain-1.1.0/bulk_chain/api.py +186 -0
  4. bulk_chain-1.1.0/bulk_chain/core/llm_base.py +24 -0
  5. bulk_chain-1.1.0/bulk_chain/core/service_asyncio.py +65 -0
  6. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_batch.py +2 -2
  7. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/utils.py +21 -2
  8. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/PKG-INFO +28 -9
  9. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/SOURCES.txt +4 -4
  10. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/setup.py +1 -2
  11. bulk_chain-1.1.0/test/test_api.py +67 -0
  12. bulk_chain-1.0.0/test/test_provider_batching.py → bulk_chain-1.1.0/test/test_api_batching.py +3 -5
  13. bulk_chain-1.1.0/test/test_api_streaming.py +21 -0
  14. bulk_chain-1.1.0/test/test_replicate_async_baseline.py +11 -0
  15. bulk_chain-1.1.0/test/test_replicate_async_batch_async.py +37 -0
  16. bulk_chain-1.0.0/bulk_chain/api.py +0 -143
  17. bulk_chain-1.0.0/bulk_chain/core/llm_base.py +0 -50
  18. bulk_chain-1.0.0/bulk_chain.egg-info/requires.txt +0 -1
  19. bulk_chain-1.0.0/test/test.py +0 -62
  20. bulk_chain-1.0.0/test/test_api.py +0 -34
  21. bulk_chain-1.0.0/test/test_api_streaming.py +0 -52
  22. bulk_chain-1.0.0/test/test_args_seeking.py +0 -26
  23. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/LICENSE +0 -0
  24. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/__init__.py +0 -0
  25. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/__init__.py +0 -0
  26. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_data.py +0 -0
  27. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_dict.py +0 -0
  28. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_json.py +0 -0
  29. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain/core/service_schema.py +0 -0
  30. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/dependency_links.txt +0 -0
  31. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/bulk_chain.egg-info/top_level.txt +0 -0
  32. {bulk_chain-1.0.0 → bulk_chain-1.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,9 +15,8 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: tqdm
19
18
 
20
- # bulk-chain 1.0.0
19
+ # bulk-chain 1.1.0
21
20
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
22
21
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
23
22
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -78,16 +77,36 @@ Below, is an example on how to declare your own schema:
78
77
 
79
78
  # Usage
80
79
 
81
- Preliminary steps:
80
+ ## 🤖 Prepare
82
81
 
83
- 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
84
- 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
82
+ 1. [schema](#chain-of-thought-schema)
83
+ * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
84
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
85
+ 3. Data (iter of dictionaries)
85
86
 
87
+ ## 🚀 Launch
86
88
 
89
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
87
90
 
88
- ## API
89
-
90
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
91
+ ```python
92
+ from bulk_chain.core.utils import dynamic_init
93
+ from bulk_chain.api import iter_content
94
+
95
+ content_it = iter_content(
96
+ # 1. Your schema.
97
+ schema="YOUR_SCHEMA.json",
98
+ # 2. Your third-party model implementation.
99
+ llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
100
+ # 3. Customize your inference and result providing modes:
101
+ infer_mode="batch_async",
102
+ return_mode="batch",
103
+ # 4. Your iterator of dictionaries
104
+ input_dicts_it=YOUR_DATA_IT,
105
+ )
106
+
107
+ for content in content_it:
108
+ # Handle your LLM responses here ...
109
+ ```
91
110
 
92
111
 
93
112
  # Embed your LLM
@@ -1,4 +1,4 @@
1
- # bulk-chain 1.0.0
1
+ # bulk-chain 1.1.0
2
2
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
3
3
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
4
4
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -59,16 +59,36 @@ Below, is an example on how to declare your own schema:
59
59
 
60
60
  # Usage
61
61
 
62
- Preliminary steps:
62
+ ## 🤖 Prepare
63
63
 
64
- 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
65
- 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
64
+ 1. [schema](#chain-of-thought-schema)
65
+ * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
66
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
67
+ 3. Data (iter of dictionaries)
66
68
 
69
+ ## 🚀 Launch
67
70
 
71
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
68
72
 
69
- ## API
70
-
71
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
73
+ ```python
74
+ from bulk_chain.core.utils import dynamic_init
75
+ from bulk_chain.api import iter_content
76
+
77
+ content_it = iter_content(
78
+ # 1. Your schema.
79
+ schema="YOUR_SCHEMA.json",
80
+ # 2. Your third-party model implementation.
81
+ llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
82
+ # 3. Customize your inference and result providing modes:
83
+ infer_mode="batch_async",
84
+ return_mode="batch",
85
+ # 4. Your iterator of dictionaries
86
+ input_dicts_it=YOUR_DATA_IT,
87
+ )
88
+
89
+ for content in content_it:
90
+ # Handle your LLM responses here ...
91
+ ```
72
92
 
73
93
 
74
94
  # Embed your LLM
@@ -0,0 +1,186 @@
1
+ import asyncio
2
+ import collections
3
+ import logging
4
+ import os
5
+ from itertools import chain
6
+
7
+ from bulk_chain.core.llm_base import BaseLM
8
+ from bulk_chain.core.service_asyncio import AsyncioService
9
+ from bulk_chain.core.service_batch import BatchIterator
10
+ from bulk_chain.core.service_data import DataService
11
+ from bulk_chain.core.service_dict import DictionaryService
12
+ from bulk_chain.core.service_json import JsonService
13
+ from bulk_chain.core.service_schema import SchemaService
14
+ from bulk_chain.core.utils import attempt_wrapper
15
+
16
+
17
+ INFER_MODES = {
18
+ "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
19
+ "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
20
+ "batch": lambda llm, batch, **kwargs: llm.ask(batch),
21
+ "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
22
+ batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
23
+ ),
24
+ "batch_stream_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
25
+ batch=batch, async_handler=llm.ask_stream_async, event_loop=kwargs.get("event_loop")
26
+ ),
27
+ }
28
+
29
+
30
+ CWD = os.getcwd()
31
+
32
+
33
def _iter_batch_prompts(c, batch_content_it, **kwargs):
    """ Renders the prompt of column `c` for every entry of the batch.

        c: name of the column; `entry[c]["prompt"]` holds the prompt template.
        batch_content_it: iterable of batch entries (dictionaries).
        kwargs: must provide `handle_missed_value_func`, which is forwarded
            to `DataService.get_prompt_text` as `handle_missed_func`.

        Yields pairs (position of the entry in the batch, rendered text).
    """
    position = 0
    for record in batch_content_it:
        rendered = DataService.get_prompt_text(
            prompt=record[c]["prompt"],
            data_dict=record,
            handle_missed_func=kwargs["handle_missed_value_func"])
        yield position, rendered
        position += 1
40
+
41
+
42
def __handle_agen_to_gen(handle, batch, event_loop):
    """ Converts the async generators produced for the batch into a single
        synchronous generator.

        handle: callable invoked as `handle(batch, event_loop=event_loop)`;
            expected to return one async generator per batch entry.
        batch: list of prompts passed to `handle`.
        event_loop: loop used to drive the async generators.

        Yields pairs (position in batch, chunk converted to str) in the order
        in which the merged generators produce them.
    """

    async def _tag(position, agen):
        # Attach the batch position to every chunk so that the origin
        # survives merging of the generators.
        async for piece in agen:
            yield position, piece

    tagged = []
    for position, agen in enumerate(handle(batch, event_loop=event_loop)):
        tagged.append(_tag(position, agen))

    merged = AsyncioService.merge_generators(*tagged)

    for position, piece in AsyncioService.async_gen_to_iter(gen=merged, loop=event_loop):
        yield position, str(piece)
60
+
61
+
62
def __handle_gen(handle, batch, event_loop):
    """ This handler deals with the iteration of each individual element of the batch.

        handle: callable invoked as `handle(batch, event_loop=event_loop)`;
            returns one output per batch entry (str or iterable of chunks).
        Yields pairs (position in batch, chunk as str).
    """

    def _iter_entry_content(entry):
        if isinstance(entry, str):
            # A complete response is treated as a single chunk.
            yield entry
        elif isinstance(entry, collections.abc.Iterable):
            yield from map(str, entry)
        else:
            raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")

    for ind_in_batch, entry in enumerate(handle(batch, event_loop=event_loop)):
        for chunk in _iter_entry_content(entry=entry):
            yield ind_in_batch, chunk


def _iter_chunks(p_column, batch_content_it, **kwargs):
    """ Infers responses for the prompts stored in `p_column` of every entry.

        kwargs must provide `infer_mode`, `handle_batch_func` and `event_loop`.
        Yields pairs (position in batch, chunk as str).
    """
    # Only "batch_stream_async" produces async generators that need merging.
    handler = __handle_agen_to_gen if kwargs["infer_mode"] == "batch_stream_async" else __handle_gen
    p_batch = [item[p_column] for item in batch_content_it]
    yield from handler(handle=kwargs["handle_batch_func"], batch=p_batch, event_loop=kwargs["event_loop"])


def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, batch_size=None, **kwargs):
    """ Fills prompt and response columns of `batch` in place and yields results.

        batch: list of dictionaries, modified in place.
        batch_ind: ordinal number of the batch in the input stream.
        schema: object exposing `p2r` (prompt columns) and `r2p` (mapping of
            a response column to the prompt column it is inferred from).
        return_mode: "chunk" yields `[global_ind, column, chunk]` per chunk,
            "record" yields every dictionary, "batch" yields the whole list.
        cols: columns to process in order; defaults to keys of the first item.
        batch_size: nominal size of batches in the stream; used to compute the
            global index of a record. Falls back to `len(batch)` when omitted.
        kwargs: forwarded to `_iter_batch_prompts` and `_iter_chunks`.
    """
    assert (isinstance(batch, list))

    if len(batch) == 0:
        return

    if cols is None:
        cols = list(batch[0].keys())

    # The trailing batch may be shorter than the others, so `len(batch)` is not
    # a valid stride for the global index.
    # NOTE(review): assumes every batch but the last one holds exactly
    # `batch_size` records -- confirm against BatchIterator.
    stride = len(batch) if batch_size is None else batch_size

    for c in cols:

        # Handling prompt column.
        if c in schema.p2r:
            content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
            for ind_in_batch, prompt in content_it:
                batch[ind_in_batch][c] = prompt

        # Handling column for inference.
        if c in schema.r2p:
            content_it = _iter_chunks(p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
            # Register values.
            for item in batch:
                item[c] = []
            for ind_in_batch, chunk in content_it:
                # Append batch.
                batch[ind_in_batch][c].append(chunk)
                # Returning (optional).
                if return_mode == "chunk":
                    global_ind = batch_ind * stride + ind_in_batch
                    yield [global_ind, c, chunk]

            # Convert content to string.
            for item in batch:
                item[c] = "".join(item[c])

    if return_mode == "record":
        for record in batch:
            yield record

    if return_mode == "batch":
        yield batch


def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
                 infer_mode="batch", return_mode="batch", attempts=1, event_loop=None,
                 **kwargs):
    """ Python API that applies `llm` to an iterator of input dictionaries
        following the given `schema`.

        input_dicts_it: iterable of dictionaries with the input data.
        llm: instance of `BaseLM`.
        schema: path to a JSON file, a dictionary, or an initialized schema.
        batch_size: amount of records grouped into a single batch.
        limit_prompt: passed to `DataService.limit_prompts` as `limit`.
        infer_mode: one of the `INFER_MODES` keys.
        return_mode: "batch", "chunk" or "record" (see `_infer_batch`).
        attempts: when greater than 1, the batch handler call is wrapped with
            `attempt_wrapper`; `attempt_delay_sec` from kwargs sets the delay.
        event_loop: loop for the async modes; the current one by default.
        kwargs: forwarded to `_infer_batch`.
    """
    assert (infer_mode in INFER_MODES.keys())
    assert (return_mode in ["batch", "chunk", "record"])
    assert (isinstance(llm, BaseLM))

    # Setup event loop.
    if event_loop is None:
        try:
            event_loop = asyncio.get_event_loop_policy().get_event_loop()
        except RuntimeError:
            # The policy refuses to provide a loop when none is set for the
            # current thread (non-main threads, recent Python versions).
            event_loop = asyncio.new_event_loop()

    # Quick initialization of the schema.
    if isinstance(schema, str):
        schema = JsonService.read(schema)
    if isinstance(schema, dict):
        schema = SchemaService(json_data=schema)

    prompts_it = map(
        lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
        input_dicts_it
    )

    def handle_batch_func(batch, **handle_kwargs):
        return INFER_MODES[infer_mode](
            llm,
            DataService.limit_prompts(batch, limit=limit_prompt),
            **handle_kwargs
        )

    # Optional wrapping into attempts.
    # NOTE(review): only the handler call itself is retried; errors raised
    # later while consuming a lazily produced stream are presumably not covered.
    if attempts > 1:
        # Optional setup of the logger.
        logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        attempt_dec = attempt_wrapper(attempts=attempts,
                                      delay_sec=kwargs.get("attempt_delay_sec", 1),
                                      logger=logger)
        handle_batch_func = attempt_dec(handle_batch_func)

    content_it = (_infer_batch(batch=batch,
                               batch_ind=batch_ind,
                               batch_size=batch_size,
                               infer_mode=infer_mode,
                               handle_batch_func=handle_batch_func,
                               handle_missed_value_func=lambda *_: None,
                               return_mode=return_mode,
                               schema=schema,
                               event_loop=event_loop,
                               **kwargs)
                  for batch_ind, batch in enumerate(BatchIterator(prompts_it, batch_size=batch_size)))

    yield from chain.from_iterable(content_it)
@@ -0,0 +1,24 @@
1
class BaseLM(object):
    """ Base class of a language model wrapper.

        Subclasses override the `ask*` methods that correspond to the
        inference modes they support; the remaining ones keep raising
        `NotImplementedError`.
    """

    def __init__(self, **kwargs):
        pass

    def ask(self, content):
        """ Assumes to return str.
        """
        # `NotImplemented` is a non-callable constant meant for binary
        # operators; the exception type is `NotImplementedError`.
        raise NotImplementedError()

    def ask_stream(self, content):
        """ Assumes to return generator.
        """
        raise NotImplementedError()

    async def ask_async(self, prompt):
        """ Assumes to return co-routine.
        """
        raise NotImplementedError()

    async def ask_stream_async(self, batch):
        """ Assumes to return AsyncGenerator.
        """
        raise NotImplementedError()
@@ -0,0 +1,65 @@
1
+ import asyncio
2
+ from typing import AsyncGenerator, Any
3
+
4
+
5
class AsyncioService:
    """ Helpers for launching async handlers over a batch and for consuming
        async generators from synchronous code.
    """

    # Private marker a producer enqueues once its generator is over;
    # a dedicated object cannot collide with an item of the stream.
    _DONE = object()

    @staticmethod
    async def _run_tasks_async(batch, async_handler):
        """ Calls `async_handler` for every prompt and awaits all the results.
        """
        tasks = [async_handler(prompt) for prompt in batch]
        return await asyncio.gather(*tasks)

    @staticmethod
    async def _run_generator(gen, output_queue, idx):
        """ Moves items of `gen` into `output_queue` as `(idx, item)` pairs.
        """
        try:
            async for item in gen:
                await output_queue.put((idx, item))
        finally:
            # Always report completion, otherwise the consumer waits forever.
            await output_queue.put((idx, AsyncioService._DONE))

    @staticmethod
    def run_tasks(event_loop, **tasks_kwargs):
        """ Synchronously runs `_run_tasks_async(**tasks_kwargs)` on `event_loop`
            and returns the list of results.
        """
        return event_loop.run_until_complete(AsyncioService._run_tasks_async(**tasks_kwargs))

    @staticmethod
    async def merge_generators(*gens: AsyncGenerator[Any, None]) -> AsyncGenerator[Any, None]:
        """ Yields items of all the generators in the order they become available.

            Raises the exception of a source generator once the items produced
            by all the generators have been delivered.
        """

        output_queue = asyncio.Queue()
        tasks = [
            asyncio.create_task(AsyncioService._run_generator(gen, output_queue, idx))
            for idx, gen in enumerate(gens)
        ]

        try:
            finished = set()
            while len(finished) < len(tasks):
                idx, item = await output_queue.get()
                if item is AsyncioService._DONE:
                    finished.add(idx)
                else:
                    yield item

            # Retrieve task results: without it a failure inside a source
            # generator is silently dropped and the output is just truncated.
            await asyncio.gather(*tasks)
        finally:
            # Also reached when the consumer closes this generator early.
            for task in tasks:
                task.cancel()

    @staticmethod
    def async_gen_to_iter(gen, loop=None):
        """ This approach is limited. Could be considered as legacy.
            https://stackoverflow.com/questions/71580727/translating-async-generator-into-sync-one/78573267#78573267

            Drives async generator `gen` on `loop` item by item. When `loop`
            is None, a new loop is created and closed at the end. The loop is
            set as the current event loop.
        """

        loop_created = False
        if loop is None:
            loop_created = True
            loop = asyncio.new_event_loop()

        asyncio.set_event_loop(loop)
        try:
            while True:
                try:
                    yield loop.run_until_complete(gen.__anext__())
                except StopAsyncIteration:
                    break
        finally:
            if loop_created:
                loop.close()
@@ -1,8 +1,8 @@
1
1
  class BatchIterator:
2
2
 
3
3
  def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
4
- assert(isinstance(batch_size, int) and batch_size > 0)
5
- assert(callable(end_value) or end_value is None)
4
+ assert (isinstance(batch_size, int) and batch_size > 0)
5
+ assert (callable(end_value) or end_value is None)
6
6
  self.__data_iter = data_iter
7
7
  self.__index = 0
8
8
  self.__batch_size = batch_size
@@ -1,6 +1,7 @@
1
1
  import importlib
2
2
  import logging
3
3
  import sys
4
+ import time
4
5
  from collections import Counter
5
6
  from os.path import dirname, join, basename
6
7
 
@@ -60,10 +61,10 @@ def auto_import(name, is_class=False):
60
61
  return m() if is_class else m
61
62
 
62
63
 
63
- def dynamic_init(class_dir, class_filepath, class_name=None):
64
+ def dynamic_init(class_filepath, class_name=None):
64
65
 
65
66
  # Registering path.
66
- target = join(class_dir, dirname(class_filepath))
67
+ target = join(dirname(class_filepath))
67
68
  logger.info(f"Adding sys path for `{target}`")
68
69
  sys.path.insert(1, target)
69
70
  class_path_list = class_filepath.split('/')
@@ -89,3 +90,21 @@ def optional_limit_iter(it_data, limit=None):
89
90
  if limit is not None and counter["returned"] > limit:
90
91
  break
91
92
  yield data
93
+
94
+
95
def attempt_wrapper(attempts, delay_sec=1, logger=None):
    """ Creates a decorator that repeats the call of a function on failure.

        attempts: maximal amount of calls.
        delay_sec: pause between two consecutive attempts; None disables it.
        logger: optional logger that receives information on failed attempts.

        The decorated function returns the result of the first successful call.
        Raises Exception when all the attempts failed; the error of the last
        attempt is attached as its cause.
    """
    # Function-scope import: keeps the block independent of the module header.
    from functools import wraps

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for i in range(attempts):
                try:
                    # Do action.
                    return func(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    if logger is not None:
                        logger.info(f"Unable to infer the result. Try {i + 1} out of {attempts}.")
                        logger.info(e)
                    # No reason to wait once there is nothing left to retry.
                    if delay_sec is not None and i < attempts - 1:
                        time.sleep(delay_sec)
            raise Exception(f"Failed after {attempts} attempts") from last_error
        return wrapper
    return decorator
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,9 +15,8 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: tqdm
19
18
 
20
- # bulk-chain 1.0.0
19
+ # bulk-chain 1.1.0
21
20
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
22
21
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
23
22
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -78,16 +77,36 @@ Below, is an example on how to declare your own schema:
78
77
 
79
78
  # Usage
80
79
 
81
- Preliminary steps:
80
+ ## 🤖 Prepare
82
81
 
83
- 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
84
- 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
82
+ 1. [schema](#chain-of-thought-schema)
83
+ * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
84
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
85
+ 3. Data (iter of dictionaries)
85
86
 
87
+ ## 🚀 Launch
86
88
 
89
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
87
90
 
88
- ## API
89
-
90
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
91
+ ```python
92
+ from bulk_chain.core.utils import dynamic_init
93
+ from bulk_chain.api import iter_content
94
+
95
+ content_it = iter_content(
96
+ # 1. Your schema.
97
+ schema="YOUR_SCHEMA.json",
98
+ # 2. Your third-party model implementation.
99
+ llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
100
+ # 3. Customize your inference and result providing modes:
101
+ infer_mode="batch_async",
102
+ return_mode="batch",
103
+ # 4. Your iterator of dictionaries
104
+ input_dicts_it=YOUR_DATA_IT,
105
+ )
106
+
107
+ for content in content_it:
108
+ # Handle your LLM responses here ...
109
+ ```
91
110
 
92
111
 
93
112
  # Embed your LLM
@@ -6,18 +6,18 @@ bulk_chain/api.py
6
6
  bulk_chain.egg-info/PKG-INFO
7
7
  bulk_chain.egg-info/SOURCES.txt
8
8
  bulk_chain.egg-info/dependency_links.txt
9
- bulk_chain.egg-info/requires.txt
10
9
  bulk_chain.egg-info/top_level.txt
11
10
  bulk_chain/core/__init__.py
12
11
  bulk_chain/core/llm_base.py
12
+ bulk_chain/core/service_asyncio.py
13
13
  bulk_chain/core/service_batch.py
14
14
  bulk_chain/core/service_data.py
15
15
  bulk_chain/core/service_dict.py
16
16
  bulk_chain/core/service_json.py
17
17
  bulk_chain/core/service_schema.py
18
18
  bulk_chain/core/utils.py
19
- test/test.py
20
19
  test/test_api.py
20
+ test/test_api_batching.py
21
21
  test/test_api_streaming.py
22
- test/test_args_seeking.py
23
- test/test_provider_batching.py
22
+ test/test_replicate_async_baseline.py
23
+ test/test_replicate_async_batch_async.py
@@ -15,7 +15,7 @@ def get_requirements(filenames):
15
15
 
16
16
  setup(
17
17
  name='bulk_chain',
18
- version='1.0.0',
18
+ version='1.1.0',
19
19
  python_requires=">=3.6",
20
20
  description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
21
21
  'ensuring reliable results for bulk input requests.',
@@ -36,5 +36,4 @@ setup(
36
36
  'chain-of-thought, '
37
37
  'reasoning',
38
38
  packages=find_packages(),
39
- install_requires=get_requirements(['dependencies.txt'])
40
39
  )
@@ -0,0 +1,67 @@
1
+ import unittest
2
+ from os.path import join
3
+
4
+ from bulk_chain.api import iter_content
5
+ from utils import current_dir, DEFAULT_REMOTE_LLM
6
+
7
+
8
+ class TestAPI(unittest.TestCase):
9
+
10
+
11
+ @staticmethod
12
+ def it_data(n):
13
+ for i in range(n):
14
+ yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
15
+
16
+ def test_single(self):
17
+ data_it = iter_content(input_dicts_it=self.it_data(20),
18
+ llm=DEFAULT_REMOTE_LLM,
19
+ batch_size=1,
20
+ infer_mode="single",
21
+ return_mode="batch",
22
+ schema=join(current_dir, "schema/default.json"))
23
+
24
+ for data in data_it:
25
+ print(data)
26
+
27
+ def test_single_stream(self):
28
+ """ Returns individual chunks.
29
+ """
30
+ data_it = iter_content(input_dicts_it=self.it_data(20),
31
+ llm=DEFAULT_REMOTE_LLM,
32
+ batch_size=1,
33
+ infer_mode="single_stream",
34
+ return_mode="chunk",
35
+ schema=join(current_dir, "schema/default.json"))
36
+
37
+ for data in data_it:
38
+ print(data)
39
+
40
+ def test_batch_async(self):
41
+ """ Return batches that passed async at the Replicate.
42
+ """
43
+ data_it = iter_content(input_dicts_it=self.it_data(20),
44
+ llm=DEFAULT_REMOTE_LLM,
45
+ batch_size=5,
46
+ infer_mode="batch_async",
47
+ return_mode="batch",
48
+ schema=join(current_dir, "schema/default.json"))
49
+
50
+ for batch in data_it:
51
+ for item in batch:
52
+ print(item)
53
+
54
+ def test_batch_stream_async(self):
55
+ data_it = iter_content(input_dicts_it=self.it_data(20),
56
+ llm=DEFAULT_REMOTE_LLM,
57
+ batch_size=5,
58
+ infer_mode="batch_stream_async",
59
+ return_mode="chunk",
60
+ schema=join(current_dir, "schema/default.json"))
61
+
62
+ for chunk_info in data_it:
63
+ print(chunk_info)
64
+
65
+
66
+ if __name__ == '__main__':
67
+ unittest.main()
@@ -1,17 +1,15 @@
1
1
  import unittest
2
- from os.path import join
3
2
 
4
3
  from tqdm import tqdm
5
4
 
6
- from bulk_chain.api import CWD, iter_content
5
+ from bulk_chain.api import iter_content
7
6
  from bulk_chain.core.utils import dynamic_init
8
7
  from utils import iter_test_jsonl_samples
9
8
 
10
9
 
11
10
  class TestProviderBatching(unittest.TestCase):
12
11
 
13
- llm = dynamic_init(class_dir=join(CWD),
14
- class_filepath="providers/transformers_flan_t5.py",
12
+ llm = dynamic_init(class_filepath="providers/transformers_flan_t5.py",
15
13
  class_name="FlanT5")(model_name="nicolay-r/flan-t5-tsa-thor-base",
16
14
  max_new_tokens=128)
17
15
 
@@ -19,9 +17,9 @@ class TestProviderBatching(unittest.TestCase):
19
17
  input_dicts_it = iter_test_jsonl_samples()
20
18
  data_it = iter_content(input_dicts_it=input_dicts_it,
21
19
  llm=self.llm,
20
+ infer_mode="batch",
22
21
  batch_size=10,
23
22
  return_batch=False,
24
- handle_missed_value_func=lambda *_: None,
25
23
  schema="schema/thor_cot_schema.json")
26
24
 
27
25
  for item in tqdm(data_it):
@@ -0,0 +1,21 @@
1
+ import unittest
2
+
3
+ from bulk_chain.api import iter_content
4
+ from utils import iter_test_jsonl_samples, DEFAULT_REMOTE_LLM
5
+
6
+
7
+ class TestAPI_Streaming(unittest.TestCase):
8
+
9
+ def test_content_iter_mode(self):
10
+
11
+ input_dicts_it = iter_test_jsonl_samples()
12
+ data_it = iter_content(input_dicts_it=input_dicts_it,
13
+ llm=DEFAULT_REMOTE_LLM,
14
+ batch_size=1,
15
+ infer_mode="single_stream",
16
+ return_mode="chunk",
17
+ attempts=2,
18
+ schema="schema/thor_cot_schema.json")
19
+
20
+ for ind_in_batch, col, item in data_it:
21
+ print("\t".join([str(ind_in_batch), str(col), item]))
@@ -0,0 +1,11 @@
1
+ from timeit import default_timer as timer
2
+ from utils import DEFAULT_REMOTE_LLM
3
+
4
+ start = timer()
5
+ r = ["".join([str(s) for s in DEFAULT_REMOTE_LLM.ask(f"what's the color of the {p}")])
6
+ for p in ["sky", "ground", "water"]]
7
+ end = timer()
8
+
9
+ total = sum(len(i) for i in r)
10
+ print(f"Completed [time: {end - start}]: {len(r[0])}, {len(r[1])}, {len(r[2])}")
11
+ print(f"TPS: {total / (end-start)}")
@@ -0,0 +1,37 @@
1
+ from timeit import default_timer as timer
2
+ import asyncio
3
+
4
+ from utils import DEFAULT_REMOTE_LLM
5
+
6
+
7
+ async def infer_item(prompt):
8
+ content = []
9
+ for chunk in DEFAULT_REMOTE_LLM.ask(prompt):
10
+ content.append(str(chunk))
11
+ return content
12
+
13
+
14
+ async def coro_infer_llm(prompt):
15
+ print(f"launch: {prompt}")
16
+ r = None
17
+ for response in asyncio.as_completed([infer_item(prompt)]):
18
+ r = await response
19
+ return "".join(r)
20
+
21
+
22
+ async def main():
23
+ batch = [f"what's the color of the {p}" for p in ["sky", "ground", "water"]]
24
+ routines = [coro_infer_llm(p) for p in batch]
25
+ return await asyncio.gather(*routines)
26
+
27
+
28
+ start = timer()
29
+ r = asyncio.run(main())
30
+ end = timer()
31
+
32
+ total = sum(len(i) for i in r)
33
+ print(r[0])
34
+ print(r[1])
35
+ print(r[2])
36
+ print(f"Completed [time: {end - start}]: {len(r[0])}, {len(r[1])}, {len(r[2])}")
37
+ print(f"TPS: {total / (end-start)}")
@@ -1,143 +0,0 @@
1
- import collections
2
- import os
3
- from itertools import chain
4
-
5
- from bulk_chain.core.llm_base import BaseLM
6
- from bulk_chain.core.service_batch import BatchIterator
7
- from bulk_chain.core.service_data import DataService
8
- from bulk_chain.core.service_dict import DictionaryService
9
- from bulk_chain.core.service_json import JsonService
10
- from bulk_chain.core.service_schema import SchemaService
11
- from bulk_chain.core.utils import dynamic_init, find_by_prefix
12
-
13
-
14
- INFER_MODES = {
15
- "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
16
- DataService.limit_prompts(batch, limit=limit_prompt))
17
- }
18
-
19
-
20
- CWD = os.getcwd()
21
-
22
-
23
- def _iter_entry_content(entry, entry_info=None, **kwargs):
24
-
25
- if isinstance(entry, str):
26
- kwargs.get("callback_str_func", lambda *_: None)(entry, entry_info)
27
- yield entry
28
- elif isinstance(entry, collections.abc.Iterable):
29
- h = kwargs.get("callback_stream_func", lambda *_: None)
30
- h(None, entry_info | {"action": "start"})
31
- for chunk in map(lambda item: str(item), entry):
32
- yield chunk
33
- h(chunk, entry_info)
34
- h(None, entry_info | {"action": "end"})
35
- else:
36
- raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
37
-
38
-
39
- def _iter_batch_prompts(c, batch_content_it, **kwargs):
40
- for ind_in_batch, entry in enumerate(batch_content_it):
41
- content = DataService.get_prompt_text(
42
- prompt=entry[c]["prompt"],
43
- data_dict=entry,
44
- handle_missed_func=kwargs["handle_missed_value_func"])
45
- yield ind_in_batch, content
46
-
47
-
48
- def _iter_batch_responses(p_column, c, batch_content_it, **kwargs):
49
- p_batch = [item[p_column] for item in batch_content_it]
50
- # TODO. This part could be async.
51
- # TODO. ind_in_batch might be a part of the async return.
52
- for ind_in_batch, entry in enumerate(kwargs["handle_batch_func"](p_batch)):
53
- yield ind_in_batch, _iter_entry_content(entry=entry, entry_info={"ind": ind_in_batch, "param": c}, **kwargs)
54
-
55
-
56
- def _infer_batch(batch, schema, return_mode, cols=None, **kwargs):
57
- assert (isinstance(batch, list))
58
-
59
- if len(batch) == 0:
60
- return batch
61
-
62
- if cols is None:
63
- first_item = batch[0]
64
- cols = list(first_item.keys()) if cols is None else cols
65
-
66
- for c in cols:
67
-
68
- # Handling prompt column.
69
- if c in schema.p2r:
70
- content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
71
- for ind_in_batch, prompt in content_it:
72
- batch[ind_in_batch][c] = prompt
73
-
74
- # Handling column for inference.
75
- if c in schema.r2p:
76
- content_it = _iter_batch_responses(c=c, p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
77
- for ind_in_batch, chunk_it in content_it:
78
-
79
- chunks = []
80
- for chunk in chunk_it:
81
- chunks.append(chunk)
82
-
83
- if return_mode == "chunk":
84
- yield [ind_in_batch, c, chunk]
85
-
86
- batch[ind_in_batch][c] = "".join(chunks)
87
-
88
- if return_mode == "record":
89
- for record in batch:
90
- yield record
91
-
92
- if return_mode == "batch":
93
- yield batch
94
-
95
-
96
- def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None, return_mode="batch", **kwargs):
97
- """ This method represent Python API aimed at application of `llm` towards
98
- iterator of input_dicts via cache_target that refers to the SQLite using
99
- the given `schema`
100
- """
101
- assert (return_mode in ["batch", "chunk"])
102
- assert (isinstance(llm, BaseLM))
103
-
104
- # Quick initialization of the schema.
105
- if isinstance(schema, str):
106
- schema = JsonService.read(schema)
107
- if isinstance(schema, dict):
108
- schema = SchemaService(json_data=schema)
109
-
110
- prompts_it = map(
111
- lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
112
- input_dicts_it
113
- )
114
-
115
- content_it = (_infer_batch(batch=batch,
116
- handle_batch_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
117
- return_mode=return_mode,
118
- schema=schema,
119
- **kwargs)
120
- for batch in BatchIterator(prompts_it, batch_size=batch_size))
121
-
122
- yield from chain.from_iterable(content_it)
123
-
124
-
125
- def init_llm(adapter, **model_kwargs):
126
- """ This method perform dynamic initialization of LLM from third-party resource.
127
- """
128
- assert (isinstance(adapter, str))
129
-
130
- # List of the Supported models and their API wrappers.
131
- models_preset = {
132
- "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
133
- class_name=llm_model_params)(**model_kwargs)
134
- }
135
-
136
- # Initialize LLM model.
137
- params = adapter.split(':')
138
- llm_model_type = params[0]
139
- llm_model_name = params[1] if len(params) > 1 else params[-1]
140
- llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
141
- llm = find_by_prefix(d=models_preset, key=llm_model_type)()
142
-
143
- return llm, llm_model_name
@@ -1,50 +0,0 @@
1
- import logging
2
- import time
3
-
4
-
5
- class BaseLM(object):
6
-
7
- def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
8
- support_batching=False, **kwargs):
9
-
10
- self.__name = name
11
- self.__attempts = 1 if attempts is None else attempts
12
- self.__delay_sec = delay_sec
13
- self.__support_batching = support_batching
14
-
15
- if enable_log:
16
- self.__logger = logging.getLogger(__name__)
17
- logging.basicConfig(level=logging.INFO)
18
-
19
- def ask_core(self, batch):
20
-
21
- for i in range(self.__attempts):
22
- try:
23
- if self.__support_batching:
24
- # Launch in batch mode.
25
- content = batch
26
- else:
27
- # Launch in non-batch mode.
28
- assert len(batch) == 1, "The LM does not support batching," \
29
- f" while size of the content is {len(batch)} which is not equal 1. " \
30
- f"Please enable batch-supporting or set required inference settings."
31
- content = batch[0]
32
-
33
- response = self.ask(content)
34
-
35
- # Wrapping into batch the response in the case of non-batching mode.
36
- return response if self.__support_batching else [response]
37
-
38
- except Exception as e:
39
- if self.__logger is not None:
40
- self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i, self.__attempts))
41
- self.__logger.info(e)
42
- time.sleep(self.__delay_sec)
43
-
44
- raise Exception("Can't infer")
45
-
46
- def ask(self, content):
47
- raise NotImplemented()
48
-
49
- def name(self):
50
- return self.__name.replace("/", "_")
@@ -1 +0,0 @@
1
- tqdm
@@ -1,62 +0,0 @@
1
- import random
2
- import time
3
- import logging
4
-
5
-
6
- def setup_logger_behaviour(name: str) -> logging.Logger:
7
- root_handlers = logging.getLogger().handlers # gets root logger
8
- current_logger = logging.getLogger(name) # gets current logger
9
- if not root_handlers: # if root logger has no handlers then create streaming handeler only
10
- new_handler = logging.StreamHandler()
11
- new_handler.terminator = ""
12
- new_handler.setFormatter(logging.Formatter("%(message)s"))
13
- current_logger.addHandler(new_handler)
14
- current_logger.propagate = False
15
- current_logger.setLevel(logging.INFO)
16
- return current_logger
17
-
18
- # Remove exixting Handlers from the current logger
19
- for handler in current_logger.handlers[:]:
20
- current_logger.removeHandler(handler)
21
-
22
- for handler_r in root_handlers: # if root logger has handlers
23
- if type(handler_r) is logging.StreamHandler: # if root logger has streaming handler
24
- new_handler = logging.StreamHandler()
25
- new_handler.terminator = "" # This will stop the printing in new line
26
- new_handler.setFormatter(logging.Formatter("%(message)s")) # This will set the format
27
- current_logger.addHandler(new_handler)
28
- elif type(handler_r) is logging.FileHandler: # if root logger has file handler
29
- new_handler = logging.FileHandler( # create new file handler
30
- handler_r.baseFilename, # with same filename and other properties
31
- handler_r.mode,
32
- handler_r.encoding,
33
- handler_r.delay,
34
- handler_r.errors,
35
- )
36
- new_handler.terminator = "" # This will stop the printing in new line
37
- new_handler.setFormatter(logging.Formatter("%(message)s")) # This will set the format
38
- current_logger.addHandler(new_handler)
39
- else:
40
- continue
41
- current_logger.propagate = False # Don't propagate to root logger
42
- return current_logger
43
-
44
- # Configure the logger
45
- logger =logging.getLogger(__name__)
46
- class FakeStreamingDataGenerator:
47
-
48
- def stream_data(self):
49
- while True:
50
- data = random.randint(0, 100)
51
- yield data
52
- time.sleep(0.5)
53
-
54
- # Example usage:
55
- generator = FakeStreamingDataGenerator()
56
- stream = generator.stream_data()
57
-
58
- logger = setup_logger_behaviour(__name__) # call you set up function here
59
- while True:
60
- chunk = next(stream)
61
- # Replacing print with logger
62
- logger.info(chunk) # Best practice now
@@ -1,34 +0,0 @@
1
- import unittest
2
- from os.path import join
3
-
4
- from bulk_chain.api import iter_content, CWD
5
- from bulk_chain.core.utils import dynamic_init
6
- from utils import current_dir, API_TOKEN
7
-
8
-
9
- class TestAPI(unittest.TestCase):
10
-
11
- llm = dynamic_init(class_dir=join(CWD, ".."),
12
- class_filepath="providers/replicate_104.py",
13
- class_name="Replicate")(api_token=API_TOKEN,
14
- model_name="deepseek-ai/deepseek-r1")
15
-
16
- @staticmethod
17
- def it_data(n):
18
- for i in range(n):
19
- yield {"ind": i, "text": "X invent sanctions against Y", "entity": "X"}
20
-
21
- def test_iter(self):
22
- data_it = iter_content(input_dicts_it=self.it_data(20),
23
- llm=self.llm,
24
- batch_size=1,
25
- handle_missed_value_func=lambda *_: None,
26
- return_mode="batch",
27
- schema=join(current_dir, "schema/default.json"))
28
-
29
- for data in data_it:
30
- print(data)
31
-
32
-
33
- if __name__ == '__main__':
34
- unittest.main()
@@ -1,52 +0,0 @@
1
- import unittest
2
- from os.path import join
3
-
4
- from tqdm import tqdm
5
-
6
- from bulk_chain.api import CWD, iter_content
7
- from bulk_chain.core.utils import dynamic_init
8
- from utils import API_TOKEN, iter_test_jsonl_samples
9
-
10
-
11
- class TestAPI_Streaming(unittest.TestCase):
12
-
13
- llm = dynamic_init(class_dir=join(CWD, ".."),
14
- class_filepath="providers/replicate_104.py",
15
- class_name="Replicate")(api_token=API_TOKEN,
16
- model_name="meta/meta-llama-3-70b-instruct",
17
- stream=True)
18
-
19
- def test_callback_mode(self):
20
-
21
- def callback(chunk, info):
22
- if chunk is None and info["action"] == "start":
23
- print(f"\n{info['param']} (batch_ind={info['ind']}):\n")
24
- return
25
- if chunk is None and info["action"] == "end":
26
- print("\n\n")
27
- return
28
- print(chunk, end="")
29
-
30
- input_dicts_it = iter_test_jsonl_samples()
31
- data_it = iter_content(input_dicts_it=input_dicts_it,
32
- llm=self.llm,
33
- return_batch=False,
34
- callback_stream_func=callback,
35
- handle_missed_value_func=lambda *_: None,
36
- schema="schema/thor_cot_schema.json")
37
-
38
- for _ in tqdm(data_it):
39
- print("\n|NEXT ENTRY|\n")
40
-
41
- def test_content_iter_mode(self):
42
-
43
- input_dicts_it = iter_test_jsonl_samples()
44
- data_it = iter_content(input_dicts_it=input_dicts_it,
45
- llm=self.llm,
46
- batch_size=1,
47
- return_mode="chunk",
48
- handle_missed_value_func=lambda *_: None,
49
- schema="schema/thor_cot_schema.json")
50
-
51
- for ind_in_batch, col, item in data_it:
52
- print("\t".join([str(ind_in_batch), str(col), item]))
@@ -1,26 +0,0 @@
1
- import unittest
2
-
3
- from bulk_chain.core.utils import iter_params
4
-
5
-
6
- class TestArgumentsSeeking(unittest.TestCase):
7
-
8
- def test(self):
9
- params = list(iter_params("X is a {x} and p is {text} and for {k}"))
10
-
11
- line = ",".join(["{{{x}}}".format(x=x) for x in params])
12
- print(line)
13
- d_params = {}
14
- for param in params:
15
- d_params[param] = 2
16
- print(d_params)
17
-
18
- z = line.format(**d_params)
19
- print(z)
20
-
21
- b = list(iter_params("X"))
22
- print(b)
23
-
24
-
25
- if __name__ == '__main__':
26
- unittest.main()
File without changes
File without changes