bulk-chain 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bulk_chain/api.py CHANGED
@@ -3,6 +3,7 @@ import collections
3
3
  import logging
4
4
  import os
5
5
  from itertools import chain
6
+ from types import AsyncGeneratorType
6
7
 
7
8
  from bulk_chain.core.llm_base import BaseLM
8
9
  from bulk_chain.core.service_asyncio import AsyncioService
@@ -16,8 +17,8 @@ from bulk_chain.core.utils import attempt_wrapper
16
17
 
17
18
  INFER_MODES = {
18
19
  "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
20
+ "batch": lambda llm, batch, **kwargs: llm.ask_batch(batch),
19
21
  "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
20
- "batch": lambda llm, batch, **kwargs: llm.ask(batch),
21
22
  "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
22
23
  batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
23
24
  ),
@@ -69,6 +70,9 @@ def __handle_gen(handle, batch, event_loop):
69
70
  elif isinstance(entry, collections.abc.Iterable):
70
71
  for chunk in map(lambda item: str(item), entry):
71
72
  yield chunk
73
+ elif isinstance(entry, AsyncGeneratorType):
74
+ for chunk in AsyncioService.async_gen_to_iter(entry, loop=event_loop):
75
+ yield str(chunk)
72
76
  else:
73
77
  raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
74
78
 
@@ -85,11 +89,14 @@ def _iter_chunks(p_column, batch_content_it, **kwargs):
85
89
  yield ind_in_batch, chunk
86
90
 
87
91
 
88
- def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
92
+ def _column_ordered_chunks_iter(batch, schema, cols=None, keep_prompts=True, **kwargs):
93
+ """
94
+ NOTE: we populate `batch` content automatically
95
+ """
89
96
  assert (isinstance(batch, list))
90
97
 
91
98
  if len(batch) == 0:
92
- return batch
99
+ return
93
100
 
94
101
  if cols is None:
95
102
  first_item = batch[0]
@@ -112,33 +119,66 @@ def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
112
119
  for ind_in_batch, chunk in content_it:
113
120
  # Append batch.
114
121
  batch[ind_in_batch][c].append(chunk)
115
- # Returning (optional).
116
- if return_mode == "chunk":
117
- global_ind = batch_ind * len(batch) + ind_in_batch
118
- yield [global_ind, c, chunk]
122
+ yield [ind_in_batch, c, chunk]
119
123
 
120
124
  # Convert content to string.
121
125
  for item in batch:
122
126
  item[c] = "".join(item[c])
123
127
 
124
- if return_mode == "record":
128
+ if not keep_prompts:
129
+ for batch_item in batch:
130
+ for key in list(batch_item.keys()):
131
+ prompt_col = SchemaService.col_to_prompt(col_name=key, prompt_data=batch_item)
132
+ if prompt_col in batch_item:
133
+ del batch_item[prompt_col]
134
+
135
+
136
+ def _infer_batch(return_type, batch, batch_ind, **kwargs):
137
+ assert (return_type in ["batch", "chunk", "record"])
138
+
139
+ # Filling batch with inference content.
140
+ for ind_in_batch, column, chunk in _column_ordered_chunks_iter(batch=batch, **kwargs):
141
+ if return_type == "chunk":
142
+ global_ind = batch_ind * len(batch) + ind_in_batch
143
+ yield [global_ind, column, chunk]
144
+
145
+ if return_type == "record":
125
146
  for record in batch:
126
147
  yield record
127
148
 
128
- if return_mode == "batch":
149
+ if return_type == "batch":
129
150
  yield batch
130
151
 
131
152
 
153
+ def get_infer_mode(stream, batch_size, async_mode):
154
+ if not stream and batch_size == 1:
155
+ return 'single', 'record'
156
+ elif not stream and batch_size > 1:
157
+ if async_mode:
158
+ return 'batch_async', 'batch'
159
+ else:
160
+ return 'batch', 'batch'
161
+ elif stream and batch_size == 1:
162
+ return 'single_stream', 'chunk'
163
+ elif stream and batch_size > 1:
164
+ return 'batch_stream_async', 'chunk'
165
+
166
+ raise ValueError(f"Invalid combination of stream and batch_size: {stream}, {batch_size}")
167
+
168
+
132
169
  def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
133
- infer_mode="batch", return_mode="batch", attempts=1, event_loop=None,
134
- **kwargs):
170
+ stream=False, async_mode=False, attempts=1, event_loop=None,
171
+ handle_missed_value_func=lambda *_: None, **kwargs):
135
172
  """ This method represent Python API aimed at application of `llm` towards
136
173
  iterator of input_dicts via cache_target that refers to the SQLite using
137
174
  the given `schema`
138
175
  """
139
- assert (infer_mode in INFER_MODES.keys())
140
- assert (return_mode in ["batch", "chunk", "record"])
141
176
  assert (isinstance(llm, BaseLM))
177
+ assert (isinstance(batch_size, int) and batch_size > 0)
178
+ assert (isinstance(async_mode, bool))
179
+
180
+ infer_type, return_type = get_infer_mode(stream=stream, batch_size=batch_size, async_mode=async_mode)
181
+ infer_mode = INFER_MODES[infer_type]
142
182
 
143
183
  # Setup event loop.
144
184
  event_loop = asyncio.get_event_loop_policy().get_event_loop() \
@@ -149,13 +189,15 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
149
189
  schema = JsonService.read(schema)
150
190
  if isinstance(schema, dict):
151
191
  schema = SchemaService(json_data=schema)
192
+ if isinstance(schema, list):
193
+ schema = SchemaService(json_data={"schema": schema})
152
194
 
153
195
  prompts_it = map(
154
196
  lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
155
197
  input_dicts_it
156
198
  )
157
199
 
158
- handle_batch_func = lambda batch, **handle_kwargs: INFER_MODES[infer_mode](
200
+ handle_batch_func = lambda batch, **handle_kwargs: infer_mode(
159
201
  llm,
160
202
  DataService.limit_prompts(batch, limit=limit_prompt),
161
203
  **handle_kwargs
@@ -172,12 +214,13 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
172
214
  logger=logger)
173
215
  handle_batch_func = attempt_dec(handle_batch_func)
174
216
 
175
- content_it = (_infer_batch(batch=batch,
217
+ kwargs["handle_missed_value_func"] = handle_missed_value_func
218
+
219
+ content_it = (_infer_batch(return_type=return_type,
220
+ batch=batch,
176
221
  batch_ind=batch_ind,
177
222
  infer_mode=infer_mode,
178
223
  handle_batch_func=handle_batch_func,
179
- handle_missed_value_func=lambda *_: None,
180
- return_mode=return_mode,
181
224
  schema=schema,
182
225
  event_loop=event_loop,
183
226
  **kwargs)
@@ -3,12 +3,17 @@ class BaseLM(object):
3
3
  def __init__(self, **kwargs):
4
4
  pass
5
5
 
6
- def ask(self, content):
6
+ def ask(self, prompt):
7
7
  """ Assumes to return str.
8
8
  """
9
9
  raise NotImplemented()
10
10
 
11
- def ask_stream(self, content):
11
+ def ask_batch(self, batch):
12
+ """ Assumes to return generator.
13
+ """
14
+ raise NotImplemented()
15
+
16
+ def ask_stream(self, prompt):
12
17
  """ Assumes to return generator.
13
18
  """
14
19
  raise NotImplemented()
@@ -18,7 +23,7 @@ class BaseLM(object):
18
23
  """
19
24
  raise NotImplemented()
20
25
 
21
- async def ask_stream_async(self, batch):
26
+ async def ask_stream_async(self, prompt):
22
27
  """ Assumes to return AsyncGenerator.
23
28
  """
24
29
  raise NotImplemented()
@@ -9,6 +9,10 @@ class SchemaService(object):
9
9
  prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
10
10
  return cls(prompt_schema)
11
11
 
12
+ @staticmethod
13
+ def col_to_prompt(col_name, prompt_data):
14
+ return col_name + "_prompt" if "in" not in prompt_data else prompt_data["in"]
15
+
12
16
  @staticmethod
13
17
  def __init_schema(prompts):
14
18
 
@@ -19,7 +23,7 @@ class SchemaService(object):
19
23
 
20
24
  for prompt in prompts:
21
25
  r_col_name = prompt["out"]
22
- p_col_name = r_col_name + "_prompt" if "in" not in prompt else prompt["in"]
26
+ p_col_name = SchemaService.col_to_prompt(col_name=r_col_name, prompt_data=prompt)
23
27
 
24
28
  assert r_col_name not in schema_r2p, f"`{r_col_name}` has been already declared!"
25
29
  assert p_col_name not in schema_p2r, f"`{p_col_name}` has been already declared!"
bulk_chain/core/utils.py CHANGED
@@ -1,3 +1,4 @@
1
+ import ast
1
2
  import importlib
2
3
  import logging
3
4
  import sys
@@ -35,18 +36,30 @@ def find_by_prefix(d, key):
35
36
  return d[matches[0]]
36
37
 
37
38
 
39
+ def check_is_param_name(param_name):
40
+ return param_name.replace("_", "").isalpha()
41
+
42
+
38
43
  def iter_params(text):
39
44
  assert(isinstance(text, str))
40
45
  beg = 0
41
46
  while beg < len(text):
47
+ print(beg)
42
48
  try:
43
49
  pb = text.index('{', beg)
44
50
  except ValueError:
45
51
  break
46
- pe = text.index('}', beg+1)
47
- # Yield argument.
48
- yield text[pb+1:pe]
49
- beg = pe+1
52
+ pe = text.index('}', pb+1)
53
+ param_name = text[pb + 1:pe]
54
+
55
+ # Check parameter validity.
56
+ if not check_is_param_name(param_name):
57
+ beg = pb + 1
58
+ continue
59
+
60
+ # Passing.
61
+ yield param_name
62
+ beg = pe + 1
50
63
 
51
64
 
52
65
  def auto_import(name, is_class=False):
@@ -61,6 +74,17 @@ def auto_import(name, is_class=False):
61
74
  return m() if is_class else m
62
75
 
63
76
 
77
+ def get_class_name(file_path):
78
+ with open(file_path, 'r') as f:
79
+ tree = ast.parse(f.read(), filename=file_path)
80
+
81
+ for node in ast.walk(tree):
82
+ if isinstance(node, ast.ClassDef):
83
+ return node.name
84
+
85
+ return None
86
+
87
+
64
88
  def dynamic_init(class_filepath, class_name=None):
65
89
 
66
90
  # Registering path.
@@ -75,7 +99,7 @@ def dynamic_init(class_filepath, class_name=None):
75
99
  class_filename = class_filename[:-len(".py")]
76
100
 
77
101
  # Loading library.
78
- class_name = class_path_list[-1].title() if class_name is None else class_name
102
+ class_name = get_class_name(class_filepath) if class_name is None else class_name
79
103
  class_path = ".".join([class_filename, class_name])
80
104
  logger.info(f"Dynamic loading for the file and class `{class_path}`")
81
105
  cls = auto_import(class_path, is_class=False)
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.1
2
- Name: bulk_chain
3
- Version: 1.1.0
2
+ Name: bulk-chain
3
+ Version: 1.2.1
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
7
7
  Author-email: rusnicolay@gmail.com
8
8
  License: MIT License
9
9
  Keywords: natural language processing,chain-of-thought,reasoning
10
+ Platform: UNKNOWN
10
11
  Classifier: Programming Language :: Python
11
12
  Classifier: Programming Language :: Python :: 3.9
12
13
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -14,9 +15,8 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
14
15
  Classifier: Topic :: Text Processing :: Linguistic
15
16
  Requires-Python: >=3.6
16
17
  Description-Content-Type: text/markdown
17
- License-File: LICENSE
18
18
 
19
- # bulk-chain 1.1.0
19
+ # bulk-chain 1.2.1
20
20
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
21
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
22
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -55,24 +55,17 @@ pip install git+https://github.com/nicolay-r/bulk-chain@master
55
55
 
56
56
  ## Chain-of-Thought Schema
57
57
 
58
- To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
59
- This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
60
-
61
- Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
62
- All the variable names are expected to be mentioned in `{}`.
63
-
64
- Below, is an example on how to declare your own schema:
58
+ To declare a Chain-of-Thought (CoT) schema we use `JSON` format.
59
+ The field `schema` is a list of CoT instructions for the Large Language Model.
60
+ Each item of the list represents a dictionary with `prompt` and `out` keys that correspond to the input prompt and output variable name, respectively.
61
+ All the variable names should be mentioned in `{}`.
65
62
 
63
+ **Example**:
66
64
  ```python
67
- {
68
- "name": "schema-name",
69
- "schema": [
70
- {"prompt": "Given the question '{text}', let's think step-by-step.",
71
- "out": "steps"},
72
- {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
73
- "out": "answer"},
65
+ [
66
+ {"prompt": "extract topic: {text}", "out": "topic"},
67
+ {"prompt": "extract subject: {text}", "out": "subject"},
74
68
  ]
75
- }
76
69
  ```
77
70
 
78
71
  # Usage
@@ -94,25 +87,50 @@ from bulk_chain.api import iter_content
94
87
 
95
88
  content_it = iter_content(
96
89
  # 1. Your schema.
97
- schema="YOUR_SCHEMA.json",
90
+ schema=[
91
+ {"prompt": "extract topic: {text}", "out": "topic" },
92
+ {"prompt": "extract subject: {text}", "out": "subject"},
93
+ ],
98
94
  # 2. Your third-party model implementation.
99
- llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
100
- # 3. Customize your inference and result providing modes:
101
- infer_mode="batch_async",
102
- return_mode="batch",
103
- # 4. Your iterator of dictionaries
104
- input_dicts_it=YOUR_DATA_IT,
95
+ llm=dynamic_init(class_filepath="replicate_104.py")(
96
+ api_token="<API-KEY>",
97
+ model_name="meta/meta-llama-3-70b-instruct"),
98
+ # 3. Toggle streaming if needed
99
+ stream=False,
100
+ # 4. Toggle Async API mode usage.
101
+ async_mode=True,
102
+ # 5. Batch size.
103
+ batch_size=10,
104
+ # 6. Your iterator of dictionaries
105
+ input_dicts_it=[
106
+ # Example of data ...
107
+ { "text": "Rocks are hard" },
108
+ { "text": "Water is wet" },
109
+ { "text": "Fire is hot" }
110
+ ],
105
111
  )
106
-
107
- for content in content_it:
108
- # Handle your LLM responses here ...
112
+
113
+ for batch in content_it:
114
+ for entry in batch:
115
+ print(entry)
109
116
  ```
110
117
 
118
+ Output entries represent texts augmented with `topic` and `subject`:
119
+ ```jsonl
120
+ {'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
121
+ {'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
122
+ {'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
123
+ ```
111
124
 
112
- # Embed your LLM
125
+ # API
113
126
 
114
- All you have to do is to implement `BaseLM` class, that includes:
115
- * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
116
- * `ask(prompt)` -- infer your model with the given `prompt`.
127
+ | Method | Mode | Description |
128
+ |----------------------|------------|---------------------------------------------------------------------|
129
+ | `ask(prompt)` | Sync | Infers the model with a single prompt. |
130
+ | `ask_stream(prompt)` | Sync | Returns a generator that yields chunks of the inferred result. |
131
+ | `ask_async(prompt)` | Async | Asynchronously infers the model with a single prompt. |
132
+ | `ask_stream_async(prompt)` | Async | Asynchronously returns a generator that yields chunks of the inferred result. |
117
133
 
118
134
  See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
135
+
136
+
@@ -0,0 +1,16 @@
1
+ bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ bulk_chain/api.py,sha256=bLZXdp58i6LDayZQxRBxsFK4lVT8cZZn1uOY0iaZ5TE,8500
3
+ bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ bulk_chain/core/llm_base.py,sha256=H2KmCqChKp9sKOkROE-4zjMRCxizT9xWvNZSF22HeFU,673
5
+ bulk_chain/core/service_asyncio.py,sha256=S-D4K3LBa3noKTm0tXazluYVI8cBgN1IB6v6MFoMyNQ,1972
6
+ bulk_chain/core/service_batch.py,sha256=lWmjO0aU6h2rmfx_kGmNqt0Rdeaf2a4Dn5VyfKFkfDs,1033
7
+ bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
8
+ bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
9
+ bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
10
+ bulk_chain/core/service_schema.py,sha256=YAsdm3N2G4-eTpeJazg4Y-KQ2w9bEPpqreVl8a-M7H0,1311
11
+ bulk_chain/core/utils.py,sha256=hml0zLmnZe865gvc1CagEzRE19Gdh1pF8kx_KueDY3A,3667
12
+ bulk_chain-1.2.1.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
13
+ bulk_chain-1.2.1.dist-info/METADATA,sha256=xx1vcG6wkHzh_Ga3iZJV3MBdR97RBGpCf7JO5_lonN0,5339
14
+ bulk_chain-1.2.1.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
15
+ bulk_chain-1.2.1.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
16
+ bulk_chain-1.2.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: bdist_wheel (0.34.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,16 +0,0 @@
1
- bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- bulk_chain/api.py,sha256=gPGjaHYIn2Ewn6yXIXER-CM5SgXQ3ZJH-SdRyaPDOo0,6890
3
- bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- bulk_chain/core/llm_base.py,sha256=aa73TGW03yLXMHY4b_1NgquRvP0CzH8IWZkcFPABFUg,557
5
- bulk_chain/core/service_asyncio.py,sha256=S-D4K3LBa3noKTm0tXazluYVI8cBgN1IB6v6MFoMyNQ,1972
6
- bulk_chain/core/service_batch.py,sha256=lWmjO0aU6h2rmfx_kGmNqt0Rdeaf2a4Dn5VyfKFkfDs,1033
7
- bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
8
- bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
9
- bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
10
- bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
11
- bulk_chain/core/utils.py,sha256=tp1FJQBmJt-3QmG7B0hyJNTFyg_8BwTTdl8xTxSgNDk,3140
12
- bulk_chain-1.1.0.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
13
- bulk_chain-1.1.0.dist-info/METADATA,sha256=EheCGDisKF0TwmzJfnDxW-rgsDVPNpCYGOvuaDn91tw,4428
14
- bulk_chain-1.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
15
- bulk_chain-1.1.0.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
16
- bulk_chain-1.1.0.dist-info/RECORD,,