bulk-chain 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bulk_chain/api.py CHANGED
@@ -3,6 +3,7 @@ import collections
3
3
  import logging
4
4
  import os
5
5
  from itertools import chain
6
+ from types import AsyncGeneratorType
6
7
 
7
8
  from bulk_chain.core.llm_base import BaseLM
8
9
  from bulk_chain.core.service_asyncio import AsyncioService
@@ -16,8 +17,8 @@ from bulk_chain.core.utils import attempt_wrapper
16
17
 
17
18
  INFER_MODES = {
18
19
  "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
20
+ "batch": lambda llm, batch, **kwargs: llm.ask_batch(batch),
19
21
  "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
20
- "batch": lambda llm, batch, **kwargs: llm.ask(batch),
21
22
  "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
22
23
  batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
23
24
  ),
@@ -69,6 +70,9 @@ def __handle_gen(handle, batch, event_loop):
69
70
  elif isinstance(entry, collections.abc.Iterable):
70
71
  for chunk in map(lambda item: str(item), entry):
71
72
  yield chunk
73
+ elif isinstance(entry, AsyncGeneratorType):
74
+ for chunk in AsyncioService.async_gen_to_iter(entry, loop=event_loop):
75
+ yield str(chunk)
72
76
  else:
73
77
  raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
74
78
 
@@ -85,11 +89,14 @@ def _iter_chunks(p_column, batch_content_it, **kwargs):
85
89
  yield ind_in_batch, chunk
86
90
 
87
91
 
88
- def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
92
+ def _column_ordered_chunks_iter(batch, schema, cols=None, keep_prompts=True, **kwargs):
93
+ """
94
+ NOTE: we populate `batch` content automatically
95
+ """
89
96
  assert (isinstance(batch, list))
90
97
 
91
98
  if len(batch) == 0:
92
- return batch
99
+ return
93
100
 
94
101
  if cols is None:
95
102
  first_item = batch[0]
@@ -112,33 +119,66 @@ def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
112
119
  for ind_in_batch, chunk in content_it:
113
120
  # Append batch.
114
121
  batch[ind_in_batch][c].append(chunk)
115
- # Returning (optional).
116
- if return_mode == "chunk":
117
- global_ind = batch_ind * len(batch) + ind_in_batch
118
- yield [global_ind, c, chunk]
122
+ yield [ind_in_batch, c, chunk]
119
123
 
120
124
  # Convert content to string.
121
125
  for item in batch:
122
126
  item[c] = "".join(item[c])
123
127
 
124
- if return_mode == "record":
128
+ if not keep_prompts:
129
+ for batch_item in batch:
130
+ for key in list(batch_item.keys()):
131
+ prompt_col = SchemaService.col_to_prompt(col_name=key, prompt_data=batch_item)
132
+ if prompt_col in batch_item:
133
+ del batch_item[prompt_col]
134
+
135
+
136
+ def _infer_batch(return_type, batch, batch_ind, **kwargs):
137
+ assert (return_type in ["batch", "chunk", "record"])
138
+
139
+ # Filling batch with inference content.
140
+ for ind_in_batch, column, chunk in _column_ordered_chunks_iter(batch=batch, **kwargs):
141
+ if return_type == "chunk":
142
+ global_ind = batch_ind * len(batch) + ind_in_batch
143
+ yield [global_ind, column, chunk]
144
+
145
+ if return_type == "record":
125
146
  for record in batch:
126
147
  yield record
127
148
 
128
- if return_mode == "batch":
149
+ if return_type == "batch":
129
150
  yield batch
130
151
 
131
152
 
153
+ def get_infer_mode(stream, batch_size, async_mode):
154
+ if not stream and batch_size == 1:
155
+ return 'single', 'record'
156
+ elif not stream and batch_size > 1:
157
+ if async_mode:
158
+ return 'batch_async', 'batch'
159
+ else:
160
+ return 'batch', 'batch'
161
+ elif stream and batch_size == 1:
162
+ return 'single_stream', 'chunk'
163
+ elif stream and batch_size > 1:
164
+ return 'batch_stream_async', 'chunk'
165
+
166
+ raise ValueError(f"Invalid combination of stream and batch_size: {stream}, {batch_size}")
167
+
168
+
132
169
  def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
133
- infer_mode="batch", return_mode="batch", attempts=1, event_loop=None,
134
- **kwargs):
170
+ stream=False, async_mode=False, attempts=1, event_loop=None,
171
+ handle_missed_value_func=lambda *_: None, **kwargs):
135
172
  """ This method represent Python API aimed at application of `llm` towards
136
173
  iterator of input_dicts via cache_target that refers to the SQLite using
137
174
  the given `schema`
138
175
  """
139
- assert (infer_mode in INFER_MODES.keys())
140
- assert (return_mode in ["batch", "chunk", "record"])
141
176
  assert (isinstance(llm, BaseLM))
177
+ assert (isinstance(batch_size, int) and batch_size > 0)
178
+ assert (isinstance(async_mode, bool))
179
+
180
+ infer_type, return_type = get_infer_mode(stream=stream, batch_size=batch_size, async_mode=async_mode)
181
+ infer_mode = INFER_MODES[infer_type]
142
182
 
143
183
  # Setup event loop.
144
184
  event_loop = asyncio.get_event_loop_policy().get_event_loop() \
@@ -149,13 +189,15 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
149
189
  schema = JsonService.read(schema)
150
190
  if isinstance(schema, dict):
151
191
  schema = SchemaService(json_data=schema)
192
+ if isinstance(schema, list):
193
+ schema = SchemaService(json_data={"schema": schema})
152
194
 
153
195
  prompts_it = map(
154
196
  lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
155
197
  input_dicts_it
156
198
  )
157
199
 
158
- handle_batch_func = lambda batch, **handle_kwargs: INFER_MODES[infer_mode](
200
+ handle_batch_func = lambda batch, **handle_kwargs: infer_mode(
159
201
  llm,
160
202
  DataService.limit_prompts(batch, limit=limit_prompt),
161
203
  **handle_kwargs
@@ -172,12 +214,13 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
172
214
  logger=logger)
173
215
  handle_batch_func = attempt_dec(handle_batch_func)
174
216
 
175
- content_it = (_infer_batch(batch=batch,
217
+ kwargs["handle_missed_value_func"] = handle_missed_value_func
218
+
219
+ content_it = (_infer_batch(return_type=return_type,
220
+ batch=batch,
176
221
  batch_ind=batch_ind,
177
222
  infer_mode=infer_mode,
178
223
  handle_batch_func=handle_batch_func,
179
- handle_missed_value_func=lambda *_: None,
180
- return_mode=return_mode,
181
224
  schema=schema,
182
225
  event_loop=event_loop,
183
226
  **kwargs)
@@ -3,12 +3,17 @@ class BaseLM(object):
3
3
  def __init__(self, **kwargs):
4
4
  pass
5
5
 
6
- def ask(self, content):
6
+ def ask(self, prompt):
7
7
  """ Assumes to return str.
8
8
  """
9
9
  raise NotImplemented()
10
10
 
11
- def ask_stream(self, content):
11
+ def ask_batch(self, batch):
12
+ """ Assumes to return generator.
13
+ """
14
+ raise NotImplemented()
15
+
16
+ def ask_stream(self, prompt):
12
17
  """ Assumes to return generator.
13
18
  """
14
19
  raise NotImplemented()
@@ -18,7 +23,7 @@ class BaseLM(object):
18
23
  """
19
24
  raise NotImplemented()
20
25
 
21
- async def ask_stream_async(self, batch):
26
+ async def ask_stream_async(self, prompt):
22
27
  """ Assumes to return AsyncGenerator.
23
28
  """
24
29
  raise NotImplemented()
@@ -9,6 +9,10 @@ class SchemaService(object):
9
9
  prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
10
10
  return cls(prompt_schema)
11
11
 
12
+ @staticmethod
13
+ def col_to_prompt(col_name, prompt_data):
14
+ return col_name + "_prompt" if "in" not in prompt_data else prompt_data["in"]
15
+
12
16
  @staticmethod
13
17
  def __init_schema(prompts):
14
18
 
@@ -19,7 +23,7 @@ class SchemaService(object):
19
23
 
20
24
  for prompt in prompts:
21
25
  r_col_name = prompt["out"]
22
- p_col_name = r_col_name + "_prompt" if "in" not in prompt else prompt["in"]
26
+ p_col_name = SchemaService.col_to_prompt(col_name=r_col_name, prompt_data=prompt)
23
27
 
24
28
  assert r_col_name not in schema_r2p, f"`{r_col_name}` has been already declared!"
25
29
  assert p_col_name not in schema_p2r, f"`{p_col_name}` has been already declared!"
bulk_chain/core/utils.py CHANGED
@@ -1,3 +1,4 @@
1
+ import ast
1
2
  import importlib
2
3
  import logging
3
4
  import sys
@@ -35,18 +36,30 @@ def find_by_prefix(d, key):
35
36
  return d[matches[0]]
36
37
 
37
38
 
39
+ def check_is_param_name(param_name):
40
+ return param_name.replace("_", "").isalpha()
41
+
42
+
38
43
  def iter_params(text):
39
44
  assert(isinstance(text, str))
40
45
  beg = 0
41
46
  while beg < len(text):
47
+ print(beg)
42
48
  try:
43
49
  pb = text.index('{', beg)
44
50
  except ValueError:
45
51
  break
46
- pe = text.index('}', beg+1)
47
- # Yield argument.
48
- yield text[pb+1:pe]
49
- beg = pe+1
52
+ pe = text.index('}', pb+1)
53
+ param_name = text[pb + 1:pe]
54
+
55
+ # Check parameter validity.
56
+ if not check_is_param_name(param_name):
57
+ beg = pb + 1
58
+ continue
59
+
60
+ # Passing.
61
+ yield param_name
62
+ beg = pe + 1
50
63
 
51
64
 
52
65
  def auto_import(name, is_class=False):
@@ -61,6 +74,17 @@ def auto_import(name, is_class=False):
61
74
  return m() if is_class else m
62
75
 
63
76
 
77
+ def get_class_name(file_path):
78
+ with open(file_path, 'r') as f:
79
+ tree = ast.parse(f.read(), filename=file_path)
80
+
81
+ for node in ast.walk(tree):
82
+ if isinstance(node, ast.ClassDef):
83
+ return node.name
84
+
85
+ return None
86
+
87
+
64
88
  def dynamic_init(class_filepath, class_name=None):
65
89
 
66
90
  # Registering path.
@@ -75,7 +99,7 @@ def dynamic_init(class_filepath, class_name=None):
75
99
  class_filename = class_filename[:-len(".py")]
76
100
 
77
101
  # Loading library.
78
- class_name = class_path_list[-1].title() if class_name is None else class_name
102
+ class_name = get_class_name(class_filepath) if class_name is None else class_name
79
103
  class_path = ".".join([class_filename, class_name])
80
104
  logger.info(f"Dynamic loading for the file and class `{class_path}`")
81
105
  cls = auto_import(class_path, is_class=False)
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.1
2
- Name: bulk_chain
3
- Version: 1.1.0
2
+ Name: bulk-chain
3
+ Version: 1.2.1
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
7
7
  Author-email: rusnicolay@gmail.com
8
8
  License: MIT License
9
9
  Keywords: natural language processing,chain-of-thought,reasoning
10
+ Platform: UNKNOWN
10
11
  Classifier: Programming Language :: Python
11
12
  Classifier: Programming Language :: Python :: 3.9
12
13
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -14,9 +15,8 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
14
15
  Classifier: Topic :: Text Processing :: Linguistic
15
16
  Requires-Python: >=3.6
16
17
  Description-Content-Type: text/markdown
17
- License-File: LICENSE
18
18
 
19
- # bulk-chain 1.1.0
19
+ # bulk-chain 1.2.1
20
20
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
21
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
22
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -55,24 +55,17 @@ pip install git+https://github.com/nicolay-r/bulk-chain@master
55
55
 
56
56
  ## Chain-of-Thought Schema
57
57
 
58
- To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
59
- This format adopts `name` field for declaring a name and `schema` is a list of CoT instructions for the Large Language Model.
60
-
61
- Each step represents a dictionary with `prompt` and `out` keys that corresponds to the input prompt and output variable name respectively.
62
- All the variable names are expected to be mentioned in `{}`.
63
-
64
- Below, is an example on how to declare your own schema:
58
+ To declare a Chain-of-Thought (CoT) schema we use `JSON` format.
59
+ The field `schema` is a list of CoT instructions for the Large Language Model.
60
+ Each item of the list represents a dictionary with `prompt` and `out` keys that correspond to the input prompt and output variable name, respectively.
61
+ All the variable names should be mentioned in `{}`.
65
62
 
63
+ **Example**:
66
64
  ```python
67
- {
68
- "name": "schema-name",
69
- "schema": [
70
- {"prompt": "Given the question '{text}', let's think step-by-step.",
71
- "out": "steps"},
72
- {"prompt": "For the question '{text}' the reasoining steps are '{steps}'. what would be an answer?",
73
- "out": "answer"},
65
+ [
66
+ {"prompt": "extract topic: {text}", "out": "topic"},
67
+ {"prompt": "extract subject: {text}", "out": "subject"},
74
68
  ]
75
- }
76
69
  ```
77
70
 
78
71
  # Usage
@@ -94,25 +87,50 @@ from bulk_chain.api import iter_content
94
87
 
95
88
  content_it = iter_content(
96
89
  # 1. Your schema.
97
- schema="YOUR_SCHEMA.json",
90
+ schema=[
91
+ {"prompt": "extract topic: {text}", "out": "topic" },
92
+ {"prompt": "extract subject: {text}", "out": "subject"},
93
+ ],
98
94
  # 2. Your third-party model implementation.
99
- llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
100
- # 3. Customize your inference and result providing modes:
101
- infer_mode="batch_async",
102
- return_mode="batch",
103
- # 4. Your iterator of dictionaries
104
- input_dicts_it=YOUR_DATA_IT,
95
+ llm=dynamic_init(class_filepath="replicate_104.py")(
96
+ api_token="<API-KEY>",
97
+ model_name="meta/meta-llama-3-70b-instruct"),
98
+ # 3. Toggle streaming if needed
99
+ stream=False,
100
+ # 4. Toggle Async API mode usage.
101
+ async_mode=True,
102
+ # 5. Batch size.
103
+ batch_size=10,
104
+ # 6. Your iterator of dictionaries
105
+ input_dicts_it=[
106
+ # Example of data ...
107
+ { "text": "Rocks are hard" },
108
+ { "text": "Water is wet" },
109
+ { "text": "Fire is hot" }
110
+ ],
105
111
  )
106
-
107
- for content in content_it:
108
- # Handle your LLM responses here ...
112
+
113
+ for batch in content_it:
114
+ for entry in batch:
115
+ print(entry)
109
116
  ```
110
117
 
118
+ Output entries represent texts augmented with `topic` and `subject`:
119
+ ```jsonl
120
+ {'text': 'Rocks are hard', 'topic': 'The topic is: Geology/Rocks', 'subject': 'The subject is: "Rocks"'}
121
+ {'text': 'Water is wet', 'topic': 'The topic is: Properties of Water', 'subject': 'The subject is: Water'}
122
+ {'text': 'Fire is hot', 'topic': 'The topic is: Temperature/Properties of Fire', 'subject': 'The subject is: "Fire"'}
123
+ ```
111
124
 
112
- # Embed your LLM
125
+ # API
113
126
 
114
- All you have to do is to implement `BaseLM` class, that includes:
115
- * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
116
- * `ask(prompt)` -- infer your model with the given `prompt`.
127
+ | Method | Mode | Description |
128
+ |----------------------|------------|---------------------------------------------------------------------|
129
+ | `ask(prompt)` | Sync | Infers the model with a single prompt. |
130
+ | `ask_stream(prompt)` | Sync | Returns a generator that yields chunks of the inferred result. |
131
+ | `ask_async(prompt)` | Async | Asynchronously infers the model with a single prompt. |
132
+ | `ask_stream_async(prompt)` | Async | Asynchronously returns a generator that yields chunks of the inferred result. |
117
133
 
118
134
  See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
135
+
136
+
@@ -0,0 +1,16 @@
1
+ bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ bulk_chain/api.py,sha256=bLZXdp58i6LDayZQxRBxsFK4lVT8cZZn1uOY0iaZ5TE,8500
3
+ bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ bulk_chain/core/llm_base.py,sha256=H2KmCqChKp9sKOkROE-4zjMRCxizT9xWvNZSF22HeFU,673
5
+ bulk_chain/core/service_asyncio.py,sha256=S-D4K3LBa3noKTm0tXazluYVI8cBgN1IB6v6MFoMyNQ,1972
6
+ bulk_chain/core/service_batch.py,sha256=lWmjO0aU6h2rmfx_kGmNqt0Rdeaf2a4Dn5VyfKFkfDs,1033
7
+ bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
8
+ bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
9
+ bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
10
+ bulk_chain/core/service_schema.py,sha256=YAsdm3N2G4-eTpeJazg4Y-KQ2w9bEPpqreVl8a-M7H0,1311
11
+ bulk_chain/core/utils.py,sha256=hml0zLmnZe865gvc1CagEzRE19Gdh1pF8kx_KueDY3A,3667
12
+ bulk_chain-1.2.1.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
13
+ bulk_chain-1.2.1.dist-info/METADATA,sha256=xx1vcG6wkHzh_Ga3iZJV3MBdR97RBGpCf7JO5_lonN0,5339
14
+ bulk_chain-1.2.1.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
15
+ bulk_chain-1.2.1.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
16
+ bulk_chain-1.2.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: bdist_wheel (0.34.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,16 +0,0 @@
1
- bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- bulk_chain/api.py,sha256=gPGjaHYIn2Ewn6yXIXER-CM5SgXQ3ZJH-SdRyaPDOo0,6890
3
- bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- bulk_chain/core/llm_base.py,sha256=aa73TGW03yLXMHY4b_1NgquRvP0CzH8IWZkcFPABFUg,557
5
- bulk_chain/core/service_asyncio.py,sha256=S-D4K3LBa3noKTm0tXazluYVI8cBgN1IB6v6MFoMyNQ,1972
6
- bulk_chain/core/service_batch.py,sha256=lWmjO0aU6h2rmfx_kGmNqt0Rdeaf2a4Dn5VyfKFkfDs,1033
7
- bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
8
- bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
9
- bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
10
- bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
11
- bulk_chain/core/utils.py,sha256=tp1FJQBmJt-3QmG7B0hyJNTFyg_8BwTTdl8xTxSgNDk,3140
12
- bulk_chain-1.1.0.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
13
- bulk_chain-1.1.0.dist-info/METADATA,sha256=EheCGDisKF0TwmzJfnDxW-rgsDVPNpCYGOvuaDn91tw,4428
14
- bulk_chain-1.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
15
- bulk_chain-1.1.0.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
16
- bulk_chain-1.1.0.dist-info/RECORD,,