bulk-chain 1.0.0.tar.gz → 1.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. bulk_chain-1.2.1/PKG-INFO +134 -0
  2. bulk_chain-1.2.1/README.md +116 -0
  3. bulk_chain-1.2.1/bulk_chain/api.py +229 -0
  4. bulk_chain-1.2.1/bulk_chain/core/llm_base.py +29 -0
  5. bulk_chain-1.2.1/bulk_chain/core/service_asyncio.py +65 -0
  6. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/service_batch.py +2 -2
  7. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/service_schema.py +5 -1
  8. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/utils.py +50 -7
  9. bulk_chain-1.2.1/bulk_chain.egg-info/PKG-INFO +134 -0
  10. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/SOURCES.txt +5 -5
  11. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/setup.py +1 -2
  12. bulk_chain-1.2.1/test/test_api.py +64 -0
  13. bulk_chain-1.0.0/test/test_provider_batching.py → bulk_chain-1.2.1/test/test_api_batching.py +4 -7
  14. bulk_chain-1.2.1/test/test_api_novita.py +34 -0
  15. bulk_chain-1.2.1/test/test_api_streaming.py +20 -0
  16. bulk_chain-1.2.1/test/test_replicate_async_baseline.py +14 -0
  17. bulk_chain-1.2.1/test/test_replicate_async_batch_async.py +40 -0
  18. bulk_chain-1.0.0/LICENSE +0 -21
  19. bulk_chain-1.0.0/PKG-INFO +0 -99
  20. bulk_chain-1.0.0/README.md +0 -80
  21. bulk_chain-1.0.0/bulk_chain/api.py +0 -143
  22. bulk_chain-1.0.0/bulk_chain/core/llm_base.py +0 -50
  23. bulk_chain-1.0.0/bulk_chain.egg-info/PKG-INFO +0 -99
  24. bulk_chain-1.0.0/bulk_chain.egg-info/requires.txt +0 -1
  25. bulk_chain-1.0.0/test/test.py +0 -62
  26. bulk_chain-1.0.0/test/test_api.py +0 -34
  27. bulk_chain-1.0.0/test/test_api_streaming.py +0 -52
  28. bulk_chain-1.0.0/test/test_args_seeking.py +0 -26
  29. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/__init__.py +0 -0
  30. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/__init__.py +0 -0
  31. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/service_data.py +0 -0
  32. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/service_dict.py +0 -0
  33. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain/core/service_json.py +0 -0
  34. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/dependency_links.txt +0 -0
  35. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/bulk_chain.egg-info/top_level.txt +0 -0
  36. {bulk_chain-1.0.0 → bulk_chain-1.2.1}/setup.cfg +0 -0
bulk_chain-1.2.1/PKG-INFO
@@ -0,0 +1,134 @@
+ Metadata-Version: 2.1
+ Name: bulk_chain
+ Version: 1.2.1
+ Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
+ Home-page: https://github.com/nicolay-r/bulk-chain
+ Author: Nicolay Rusnachenko
+ Author-email: rusnicolay@gmail.com
+ License: MIT License
+ Description: # bulk-chain 1.2.1
+ ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
+ [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+ [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
+ [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
+
+ <p align="center">
+ <img src="logo.png"/>
+ </p>
+
+ <p align="center">
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+ <br>
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
+ </p>
+
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
+
+ ### Main Features
+ * ✅ **No-strings**: you're free of LLM dependencies and can customize your `venv` flexibly.
+ * ✅ **Supports schema descriptions** for the Chain-of-Thought concept.
+ * ✅ **Provides an iterator over an infinite amount of input contexts**
+
+ # Installation
+
+ From PyPI:
+
+ ```bash
+ pip install --no-deps bulk-chain
+ ```
+
+ or the latest version from here:
+
+ ```bash
+ pip install git+https://github.com/nicolay-r/bulk-chain@master
+ ```
+
+ ## Chain-of-Thought Schema
+
+ To declare a Chain-of-Thought (CoT) schema we use the `JSON` format.
+ The field `schema` is a list of CoT instructions for the Large Language Model.
+ Each item of the list represents a dictionary with `prompt` and `out` keys that correspond to the input prompt and the output variable name respectively.
+ All the variable names should be mentioned in `{}`.
+
+ **Example**:
+ ```python
+ [
+     {"prompt": "extract topic: {text}", "out": "topic"},
+     {"prompt": "extract subject: {text}", "out": "subject"},
+ ]
+ ```
+
+ # Usage
+
+ ## 🤖 Prepare
+
+ 1. [schema](#chain-of-thought-schema)
+    * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+ 3. Data (an iterator of dictionaries)
+
+ ## 🚀 Launch
+
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+
+ ```python
+ from bulk_chain.core.utils import dynamic_init
+ from bulk_chain.api import iter_content
+
+ content_it = iter_content(
+     # 1. Your schema.
+     schema=[
+         {"prompt": "extract topic: {text}", "out": "topic"},
+         {"prompt": "extract subject: {text}", "out": "subject"},
+     ],
+     # 2. Your third-party model implementation.
+     llm=dynamic_init(class_filepath="replicate_104.py")(
+         api_token="<API-KEY>",
+         model_name="meta/meta-llama-3-70b-instruct"),
+     # 3. Toggle streaming if needed.
+     stream=False,
+     # 4. Toggle Async API mode usage.
+     async_mode=True,
+     # 5. Batch size.
+     batch_size=10,
+     # 6. Your iterator of dictionaries.
+     input_dicts_it=[
+         # Example of data ...
+         {"text": "Rocks are hard"},
+         {"text": "Water is wet"},
+         {"text": "Fire is hot"}
+     ],
+ )
+
+ for batch in content_it:
+     for entry in batch:
+         print(entry)
+ ```
+
+ Output entries represent the input texts augmented with `topic` and `subject`:
+ ```jsonl
+ {'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
+ {'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
+ {'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
+ ```
+
+ # API
+
+ | Method                     | Mode  | Description                                                           |
+ |----------------------------|-------|-----------------------------------------------------------------------|
+ | `ask(prompt)`              | Sync  | Infers the model with a single prompt.                                |
+ | `ask_stream(prompt)`       | Sync  | Returns a generator that yields chunks of the inferred result.        |
+ | `ask_async(prompt)`        | Async | Asynchronously infers the model with a single prompt.                 |
+ | `ask_stream_async(prompt)` | Async | Asynchronously returns a generator of chunks of the inferred result.  |
+
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+
+ Keywords: natural language processing,chain-of-thought,reasoning
+ Platform: UNKNOWN
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
bulk_chain-1.2.1/README.md
@@ -0,0 +1,116 @@
+ # bulk-chain 1.2.1
+ ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
+ [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+ [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
+ [![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
+
+ <p align="center">
+ <img src="logo.png"/>
+ </p>
+
+ <p align="center">
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
+ <br>
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
+ </p>
+
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
+
+ ### Main Features
+ * ✅ **No-strings**: you're free of LLM dependencies and can customize your `venv` flexibly.
+ * ✅ **Supports schema descriptions** for the Chain-of-Thought concept.
+ * ✅ **Provides an iterator over an infinite amount of input contexts**
+
+ # Installation
+
+ From PyPI:
+
+ ```bash
+ pip install --no-deps bulk-chain
+ ```
+
+ or the latest version from here:
+
+ ```bash
+ pip install git+https://github.com/nicolay-r/bulk-chain@master
+ ```
+
+ ## Chain-of-Thought Schema
+
+ To declare a Chain-of-Thought (CoT) schema we use the `JSON` format.
+ The field `schema` is a list of CoT instructions for the Large Language Model.
+ Each item of the list represents a dictionary with `prompt` and `out` keys that correspond to the input prompt and the output variable name respectively.
+ All the variable names should be mentioned in `{}`.
+
+ **Example**:
+ ```python
+ [
+     {"prompt": "extract topic: {text}", "out": "topic"},
+     {"prompt": "extract subject: {text}", "out": "subject"},
+ ]
+ ```
+
+ # Usage
+
+ ## 🤖 Prepare
+
+ 1. [schema](#chain-of-thought-schema)
+    * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
+ 3. Data (an iterator of dictionaries)
+
+ ## 🚀 Launch
+
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+
+ ```python
+ from bulk_chain.core.utils import dynamic_init
+ from bulk_chain.api import iter_content
+
+ content_it = iter_content(
+     # 1. Your schema.
+     schema=[
+         {"prompt": "extract topic: {text}", "out": "topic"},
+         {"prompt": "extract subject: {text}", "out": "subject"},
+     ],
+     # 2. Your third-party model implementation.
+     llm=dynamic_init(class_filepath="replicate_104.py")(
+         api_token="<API-KEY>",
+         model_name="meta/meta-llama-3-70b-instruct"),
+     # 3. Toggle streaming if needed.
+     stream=False,
+     # 4. Toggle Async API mode usage.
+     async_mode=True,
+     # 5. Batch size.
+     batch_size=10,
+     # 6. Your iterator of dictionaries.
+     input_dicts_it=[
+         # Example of data ...
+         {"text": "Rocks are hard"},
+         {"text": "Water is wet"},
+         {"text": "Fire is hot"}
+     ],
+ )
+
+ for batch in content_it:
+     for entry in batch:
+         print(entry)
+ ```
+
+ Output entries represent the input texts augmented with `topic` and `subject`:
+ ```jsonl
+ {'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
+ {'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
+ {'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
+ ```
+
+ # API
+
+ | Method                     | Mode  | Description                                                           |
+ |----------------------------|-------|-----------------------------------------------------------------------|
+ | `ask(prompt)`              | Sync  | Infers the model with a single prompt.                                |
+ | `ask_stream(prompt)`       | Sync  | Returns a generator that yields chunks of the inferred result.        |
+ | `ask_async(prompt)`        | Async | Asynchronously infers the model with a single prompt.                 |
+ | `ask_stream_async(prompt)` | Async | Asynchronously returns a generator of chunks of the inferred result.  |
+
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
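
To make the API table above concrete, here is a minimal usage sketch of driving a provider directly through those four methods. The provider file `replicate_104.py`, the API-key placeholder, and the model name mirror the README example; the prompts are illustrative, and `ask_stream_async` is consumed as a coroutine that returns an async generator, which matches the `BaseLM` contract shown later in this diff but may differ per provider.

```python
import asyncio

from bulk_chain.core.utils import dynamic_init

# Provider file and model name are taken from the README example above;
# the API key is a placeholder.
llm = dynamic_init(class_filepath="replicate_104.py")(
    api_token="<API-KEY>",
    model_name="meta/meta-llama-3-70b-instruct")

# Sync API.
print(llm.ask("extract topic: Rocks are hard"))
for chunk in llm.ask_stream("extract topic: Water is wet"):
    print(chunk, end="")


async def main():
    # Async API.
    print(await llm.ask_async("extract topic: Fire is hot"))
    # Assumption: ask_stream_async is awaited first and then iterated,
    # per the BaseLM docstring ("Assumed to return an AsyncGenerator").
    async for chunk in await llm.ask_stream_async("extract topic: Fire is hot"):
        print(chunk, end="")

asyncio.run(main())
```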
bulk_chain-1.2.1/bulk_chain/api.py
@@ -0,0 +1,229 @@
+ import asyncio
+ import collections
+ import logging
+ import os
+ from itertools import chain
+ from types import AsyncGeneratorType
+
+ from bulk_chain.core.llm_base import BaseLM
+ from bulk_chain.core.service_asyncio import AsyncioService
+ from bulk_chain.core.service_batch import BatchIterator
+ from bulk_chain.core.service_data import DataService
+ from bulk_chain.core.service_dict import DictionaryService
+ from bulk_chain.core.service_json import JsonService
+ from bulk_chain.core.service_schema import SchemaService
+ from bulk_chain.core.utils import attempt_wrapper
+
+
+ INFER_MODES = {
+     "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
+     "batch": lambda llm, batch, **kwargs: llm.ask_batch(batch),
+     "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
+     "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
+         batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
+     ),
+     "batch_stream_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
+         batch=batch, async_handler=llm.ask_stream_async, event_loop=kwargs.get("event_loop")
+     ),
+ }
+
+
+ CWD = os.getcwd()
+
+
+ def _iter_batch_prompts(c, batch_content_it, **kwargs):
+     for ind_in_batch, entry in enumerate(batch_content_it):
+         content = DataService.get_prompt_text(
+             prompt=entry[c]["prompt"],
+             data_dict=entry,
+             handle_missed_func=kwargs["handle_missed_value_func"])
+         yield ind_in_batch, content
+
+
+ def __handle_agen_to_gen(handle, batch, event_loop):
+     """ This handler provides conversion of the async generator to a (sync) generator.
+     """
+
+     def __wrap_with_index(async_gens):
+         async def wrapper(index, agen):
+             async for item in agen:
+                 yield index, item
+         return [wrapper(i, agen) for i, agen in enumerate(async_gens)]
+
+     agen_list = handle(batch, event_loop=event_loop)
+
+     it = AsyncioService.async_gen_to_iter(
+         gen=AsyncioService.merge_generators(*__wrap_with_index(agen_list)),
+         loop=event_loop)
+
+     for ind_in_batch, chunk in it:
+         yield ind_in_batch, str(chunk)
+
+
+ def __handle_gen(handle, batch, event_loop):
+     """ This handler deals with the iteration of each individual element of the batch.
+     """
+
+     def _iter_entry_content(entry):
+         if isinstance(entry, str):
+             yield entry
+         elif isinstance(entry, collections.abc.Iterable):
+             for chunk in map(lambda item: str(item), entry):
+                 yield chunk
+         elif isinstance(entry, AsyncGeneratorType):
+             for chunk in AsyncioService.async_gen_to_iter(entry, loop=event_loop):
+                 yield str(chunk)
+         else:
+             raise Exception(f"Unsupported type `{type(entry)}` for handling output from batch")
+
+     for ind_in_batch, entry in enumerate(handle(batch, event_loop=event_loop)):
+         for chunk in _iter_entry_content(entry=entry):
+             yield ind_in_batch, chunk
+
+
+ def _iter_chunks(p_column, batch_content_it, **kwargs):
+     handler = __handle_agen_to_gen if kwargs["infer_mode"] == "batch_stream_async" else __handle_gen
+     p_batch = [item[p_column] for item in batch_content_it]
+     it = handler(handle=kwargs["handle_batch_func"], batch=p_batch, event_loop=kwargs["event_loop"])
+     for ind_in_batch, chunk in it:
+         yield ind_in_batch, chunk
+
+
+ def _column_ordered_chunks_iter(batch, schema, cols=None, keep_prompts=True, **kwargs):
+     """
+     NOTE: we populate `batch` content automatically
+     """
+     assert (isinstance(batch, list))
+
+     if len(batch) == 0:
+         return
+
+     if cols is None:
+         first_item = batch[0]
+         cols = list(first_item.keys()) if cols is None else cols
+
+     for c in cols:
+
+         # Handling prompt column.
+         if c in schema.p2r:
+             content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
+             for ind_in_batch, prompt in content_it:
+                 batch[ind_in_batch][c] = prompt
+
+         # Handling column for inference.
+         if c in schema.r2p:
+             content_it = _iter_chunks(p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
+             # Register values.
+             for item in batch:
+                 item[c] = []
+             for ind_in_batch, chunk in content_it:
+                 # Append batch.
+                 batch[ind_in_batch][c].append(chunk)
+                 yield [ind_in_batch, c, chunk]
+
+             # Convert content to string.
+             for item in batch:
+                 item[c] = "".join(item[c])
+
+     if not keep_prompts:
+         for batch_item in batch:
+             for key in list(batch_item.keys()):
+                 prompt_col = SchemaService.col_to_prompt(col_name=key, prompt_data=batch_item)
+                 if prompt_col in batch_item:
+                     del batch_item[prompt_col]
+
+
+ def _infer_batch(return_type, batch, batch_ind, **kwargs):
+     assert (return_type in ["batch", "chunk", "record"])
+
+     # Filling batch with inference content.
+     for ind_in_batch, column, chunk in _column_ordered_chunks_iter(batch=batch, **kwargs):
+         if return_type == "chunk":
+             global_ind = batch_ind * len(batch) + ind_in_batch
+             yield [global_ind, column, chunk]
+
+     if return_type == "record":
+         for record in batch:
+             yield record
+
+     if return_type == "batch":
+         yield batch
+
+
+ def get_infer_mode(stream, batch_size, async_mode):
+     if not stream and batch_size == 1:
+         return 'single', 'record'
+     elif not stream and batch_size > 1:
+         if async_mode:
+             return 'batch_async', 'batch'
+         else:
+             return 'batch', 'batch'
+     elif stream and batch_size == 1:
+         return 'single_stream', 'chunk'
+     elif stream and batch_size > 1:
+         return 'batch_stream_async', 'chunk'
+
+     raise ValueError(f"Invalid combination of stream and batch_size: {stream}, {batch_size}")
+
+
+ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
+                  stream=False, async_mode=False, attempts=1, event_loop=None,
+                  handle_missed_value_func=lambda *_: None, **kwargs):
+     """ This method represents the Python API aimed at application of `llm`
+         towards an iterator of input_dicts, filling each input dictionary
+         according to the given `schema`.
+     """
+     assert (isinstance(llm, BaseLM))
+     assert (isinstance(batch_size, int) and batch_size > 0)
+     assert (isinstance(async_mode, bool))
+
+     infer_type, return_type = get_infer_mode(stream=stream, batch_size=batch_size, async_mode=async_mode)
+     infer_mode = INFER_MODES[infer_type]
+
+     # Setup event loop.
+     event_loop = asyncio.get_event_loop_policy().get_event_loop() \
+         if event_loop is None else event_loop
+
+     # Quick initialization of the schema.
+     if isinstance(schema, str):
+         schema = JsonService.read(schema)
+     if isinstance(schema, dict):
+         schema = SchemaService(json_data=schema)
+     if isinstance(schema, list):
+         schema = SchemaService(json_data={"schema": schema})
+
+     prompts_it = map(
+         lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
+         input_dicts_it
+     )
+
+     handle_batch_func = lambda batch, **handle_kwargs: infer_mode(
+         llm,
+         DataService.limit_prompts(batch, limit=limit_prompt),
+         **handle_kwargs
+     )
+
+     # Optional wrapping into attempts.
+     if attempts > 1:
+         # Optional setup of the logger.
+         logger = logging.getLogger(__name__)
+         logging.basicConfig(level=logging.INFO)
+
+         attempt_dec = attempt_wrapper(attempts=attempts,
+                                       delay_sec=kwargs.get("attempt_delay_sec", 1),
+                                       logger=logger)
+         handle_batch_func = attempt_dec(handle_batch_func)
+
+     kwargs["handle_missed_value_func"] = handle_missed_value_func
+
+     content_it = (_infer_batch(return_type=return_type,
+                                batch=batch,
+                                batch_ind=batch_ind,
+                                infer_mode=infer_mode,
+                                handle_batch_func=handle_batch_func,
+                                schema=schema,
+                                event_loop=event_loop,
+                                **kwargs)
+                   for batch_ind, batch in enumerate(BatchIterator(prompts_it, batch_size=batch_size)))
+
+     yield from chain.from_iterable(content_it)
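
The `get_infer_mode` helper in `api.py` above fully determines both the inference strategy and the shape of the items yielded by `iter_content`. A small sketch of the mapping, with expected values read directly from the function body:

```python
from bulk_chain.api import get_infer_mode

# (stream, batch_size, async_mode) -> (infer mode, yielded item type)
print(get_infer_mode(stream=False, batch_size=1, async_mode=False))   # ('single', 'record')
print(get_infer_mode(stream=False, batch_size=10, async_mode=False))  # ('batch', 'batch')
print(get_infer_mode(stream=False, batch_size=10, async_mode=True))   # ('batch_async', 'batch')
print(get_infer_mode(stream=True, batch_size=1, async_mode=False))    # ('single_stream', 'chunk')
print(get_infer_mode(stream=True, batch_size=10, async_mode=False))   # ('batch_stream_async', 'chunk')
```

Per `_infer_batch`, the `'chunk'` return type yields `[global_index, column, chunk]` triplets, `'record'` yields individual dictionaries, and `'batch'` yields the whole batch list.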
bulk_chain-1.2.1/bulk_chain/core/llm_base.py
@@ -0,0 +1,29 @@
+ class BaseLM(object):
+
+     def __init__(self, **kwargs):
+         pass
+
+     def ask(self, prompt):
+         """ Assumed to return str.
+         """
+         raise NotImplementedError()
+
+     def ask_batch(self, batch):
+         """ Assumed to return a generator.
+         """
+         raise NotImplementedError()
+
+     def ask_stream(self, prompt):
+         """ Assumed to return a generator.
+         """
+         raise NotImplementedError()
+
+     async def ask_async(self, prompt):
+         """ Assumed to return a coroutine.
+         """
+         raise NotImplementedError()
+
+     async def ask_stream_async(self, prompt):
+         """ Assumed to return an AsyncGenerator.
+         """
+         raise NotImplementedError()
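
`BaseLM` only defines the provider interface; a concrete provider overrides the subset of methods required by the chosen infer mode. The class below is a toy, hypothetical example (not part of the package) that simply echoes prompts back:

```python
from bulk_chain.core.llm_base import BaseLM


class EchoLM(BaseLM):
    """Toy provider: echoes prompts instead of querying a real LLM."""

    def ask(self, prompt):
        # Sync single-prompt inference returns a plain string.
        return f"echo: {prompt}"

    def ask_batch(self, batch):
        # Returns a generator over the batch, as the base docstring assumes.
        return (self.ask(p) for p in batch)

    def ask_stream(self, prompt):
        # Sync streaming yields chunks one by one.
        for word in prompt.split():
            yield word + " "

    async def ask_async(self, prompt):
        return f"echo: {prompt}"

    async def ask_stream_async(self, prompt):
        # Returns an AsyncGenerator when awaited, per the base docstring.
        async def _chunks():
            for word in prompt.split():
                yield word + " "
        return _chunks()
```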
bulk_chain-1.2.1/bulk_chain/core/service_asyncio.py
@@ -0,0 +1,65 @@
+ import asyncio
+ from typing import AsyncGenerator, Any
+
+
+ class AsyncioService:
+
+     @staticmethod
+     async def _run_tasks_async(batch, async_handler):
+         tasks = [async_handler(prompt) for prompt in batch]
+         return await asyncio.gather(*tasks)
+
+     @staticmethod
+     async def _run_generator(gen, output_queue, idx):
+         try:
+             async for item in gen:
+                 await output_queue.put((idx, item))
+         finally:
+             await output_queue.put((idx, StopAsyncIteration))
+
+
+     @staticmethod
+     def run_tasks(event_loop, **tasks_kwargs):
+         return event_loop.run_until_complete(AsyncioService._run_tasks_async(**tasks_kwargs))
+
+     @staticmethod
+     async def merge_generators(*gens: AsyncGenerator[Any, None]) -> AsyncGenerator[Any, None]:
+
+         output_queue = asyncio.Queue()
+         tasks = [
+             asyncio.create_task(AsyncioService._run_generator(gen, output_queue, idx))
+             for idx, gen in enumerate(gens)
+         ]
+
+         finished = set()
+         while len(finished) < len(tasks):
+             idx, item = await output_queue.get()
+             if item is StopAsyncIteration:
+                 finished.add(idx)
+             else:
+                 yield item
+
+         for task in tasks:
+             task.cancel()
+
+     @staticmethod
+     def async_gen_to_iter(gen, loop=None):
+         """ This approach is limited. Could be considered as legacy.
+         https://stackoverflow.com/questions/71580727/translating-async-generator-into-sync-one/78573267#78573267
+         """
+
+         loop_created = False
+         if loop is None:
+             loop_created = True
+             loop = asyncio.new_event_loop()
+
+         asyncio.set_event_loop(loop)
+         try:
+             while True:
+                 try:
+                     yield loop.run_until_complete(gen.__anext__())
+                 except StopAsyncIteration:
+                     break
+         finally:
+             if loop_created:
+                 loop.close()
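
A small, self-contained sketch of how `merge_generators` and `async_gen_to_iter` compose; the `ticker` generators are illustrative only and not part of the package:

```python
import asyncio

from bulk_chain.core.service_asyncio import AsyncioService


async def ticker(name, count, delay):
    # Illustrative async generator producing a few labelled items.
    for i in range(count):
        await asyncio.sleep(delay)
        yield f"{name}-{i}"


# merge_generators interleaves items from several async generators as they
# become ready; async_gen_to_iter then drains the merged stream synchronously.
merged = AsyncioService.merge_generators(
    ticker("fast", 3, 0.01),
    ticker("slow", 2, 0.03))

for item in AsyncioService.async_gen_to_iter(merged):
    print(item)  # interleaving order depends on timing
```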
bulk_chain-1.2.1/bulk_chain/core/service_batch.py
@@ -1,8 +1,8 @@
  class BatchIterator:

      def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
-         assert(isinstance(batch_size, int) and batch_size > 0)
-         assert(callable(end_value) or end_value is None)
+         assert (isinstance(batch_size, int) and batch_size > 0)
+         assert (callable(end_value) or end_value is None)
          self.__data_iter = data_iter
          self.__index = 0
          self.__batch_size = batch_size
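
For context, `BatchIterator` is consumed in `api.py` by simply iterating it to obtain consecutive batches. A hypothetical usage sketch (the iteration protocol itself is not shown in this diff, but batches behave as lists in `api.py`):

```python
from bulk_chain.core.service_batch import BatchIterator

# Group an input iterator into consecutive batches of at most 3 items.
for batch in BatchIterator(iter(range(7)), batch_size=3):
    print(batch)  # each iteration yields the next batch of up to 3 items
```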
bulk_chain-1.2.1/bulk_chain/core/service_schema.py
@@ -9,6 +9,10 @@ class SchemaService(object):
          prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
          return cls(prompt_schema)

+     @staticmethod
+     def col_to_prompt(col_name, prompt_data):
+         return col_name + "_prompt" if "in" not in prompt_data else prompt_data["in"]
+
      @staticmethod
      def __init_schema(prompts):

@@ -19,7 +23,7 @@ class SchemaService(object):

          for prompt in prompts:
              r_col_name = prompt["out"]
-             p_col_name = r_col_name + "_prompt" if "in" not in prompt else prompt["in"]
+             p_col_name = SchemaService.col_to_prompt(col_name=r_col_name, prompt_data=prompt)

              assert r_col_name not in schema_r2p, f"`{r_col_name}` has been already declared!"
              assert p_col_name not in schema_p2r, f"`{p_col_name}` has been already declared!"
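
The new `col_to_prompt` helper centralizes the naming rule that was previously inlined in `__init_schema`; a quick sketch of its behavior based on the code above:

```python
from bulk_chain.core.service_schema import SchemaService

# Without an "in" field, the prompt column name is derived from the output column name.
print(SchemaService.col_to_prompt(col_name="topic", prompt_data={"out": "topic"}))
# -> topic_prompt

# With an explicit "in" field, that declared column name is reused.
print(SchemaService.col_to_prompt(col_name="response",
                                  prompt_data={"out": "response", "in": "prompt"}))
# -> prompt
```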