bulk-chain 0.25.3__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bulk_chain/api.py CHANGED
@@ -1,70 +1,91 @@
1
+ import asyncio
1
2
  import collections
3
+ import logging
2
4
  import os
3
5
  from itertools import chain
4
6
 
5
7
  from bulk_chain.core.llm_base import BaseLM
6
- from bulk_chain.core.service_batch import BatchIterator, BatchService
8
+ from bulk_chain.core.service_asyncio import AsyncioService
9
+ from bulk_chain.core.service_batch import BatchIterator
7
10
  from bulk_chain.core.service_data import DataService
8
11
  from bulk_chain.core.service_dict import DictionaryService
9
12
  from bulk_chain.core.service_json import JsonService
10
13
  from bulk_chain.core.service_schema import SchemaService
11
- from bulk_chain.core.utils import dynamic_init, find_by_prefix
14
+ from bulk_chain.core.utils import attempt_wrapper
12
15
 
13
16
 
14
17
  INFER_MODES = {
15
- "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
16
- DataService.limit_prompts(batch, limit=limit_prompt))
18
+ "single": lambda llm, batch, **kwargs: [llm.ask(prompt) for prompt in batch],
19
+ "single_stream": lambda llm, batch, **kwargs: [llm.ask_stream(prompt) for prompt in batch],
20
+ "batch": lambda llm, batch, **kwargs: llm.ask(batch),
21
+ "batch_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
22
+ batch=batch, async_handler=llm.ask_async, event_loop=kwargs.get("event_loop")
23
+ ),
24
+ "batch_stream_async": lambda llm, batch, **kwargs: AsyncioService.run_tasks(
25
+ batch=batch, async_handler=llm.ask_stream_async, event_loop=kwargs.get("event_loop")
26
+ ),
17
27
  }
18
28
 
19
29
 
20
30
  CWD = os.getcwd()
21
31
 
22
32
 
23
- def _handle_entry(entry, entry_info=None, **kwargs):
33
+ def _iter_batch_prompts(c, batch_content_it, **kwargs):
34
+ for ind_in_batch, entry in enumerate(batch_content_it):
35
+ content = DataService.get_prompt_text(
36
+ prompt=entry[c]["prompt"],
37
+ data_dict=entry,
38
+ handle_missed_func=kwargs["handle_missed_value_func"])
39
+ yield ind_in_batch, content
24
40
 
25
- if isinstance(entry, str):
26
- kwargs.get("callback_str_func", lambda *_: None)(entry, entry_info)
27
- return entry
28
- elif isinstance(entry, collections.abc.Iterable):
29
- chunks = []
30
- h = kwargs.get("callback_stream_func", lambda *_: None)
31
41
 
32
- h(None, entry_info | {"action": "start"})
42
+ def __handle_agen_to_gen(handle, batch, event_loop):
43
+ """ This handler provides conversion of the async generator to generator (sync).
44
+ """
33
45
 
34
- for chunk in map(lambda item: str(item), entry):
35
- chunks.append(chunk)
36
- h(chunk, entry_info)
46
+ def __wrap_with_index(async_gens):
47
+ async def wrapper(index, agen):
48
+ async for item in agen:
49
+ yield index, item
50
+ return [wrapper(i, agen) for i, agen in enumerate(async_gens)]
37
51
 
38
- h(None, entry_info | {"action": "end"})
52
+ agen_list = handle(batch, event_loop=event_loop)
39
53
 
40
- return "".join(chunks)
54
+ it = AsyncioService.async_gen_to_iter(
55
+ gen=AsyncioService.merge_generators(*__wrap_with_index(agen_list)),
56
+ loop=event_loop)
41
57
 
42
- raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
58
+ for ind_in_batch, chunk in it:
59
+ yield ind_in_batch, str(chunk)
43
60
 
44
61
 
45
- def _update_batch_content(c, batch, schema, **kwargs):
46
- assert (isinstance(batch, list))
47
- assert (isinstance(c, str))
48
-
49
- if c in schema.p2r:
50
- for batch_item in batch:
51
- batch_item[c] = DataService.get_prompt_text(
52
- prompt=batch_item[c]["prompt"],
53
- data_dict=batch_item,
54
- handle_missed_func=kwargs["handle_missed_value_func"])
55
- if c in schema.r2p:
56
- p_column = schema.r2p[c]
57
- # This instruction takes a lot of time in a non-batching mode.
58
- BatchService.handle_param_as_batch(
59
- batch=batch,
60
- src_param=p_column,
61
- tgt_param=c,
62
- handle_batch_func=lambda b: kwargs["handle_batch_func"](b),
63
- handle_entry_func=lambda entry, info: _handle_entry(entry=entry, entry_info=info, **kwargs)
64
- )
65
-
66
-
67
- def _infer_batch(batch, schema, cols=None, **kwargs):
62
+ def __handle_gen(handle, batch, event_loop):
63
+ """ This handler deals with the iteration of each individual element of the batch.
64
+ """
65
+
66
+ def _iter_entry_content(entry):
67
+ if isinstance(entry, str):
68
+ yield entry
69
+ elif isinstance(entry, collections.abc.Iterable):
70
+ for chunk in map(lambda item: str(item), entry):
71
+ yield chunk
72
+ else:
73
+ raise Exception(f"Non supported type `{type(entry)}` for handling output from batch")
74
+
75
+ for ind_in_batch, entry in enumerate(handle(batch, event_loop=event_loop)):
76
+ for chunk in _iter_entry_content(entry=entry):
77
+ yield ind_in_batch, chunk
78
+
79
+
80
+ def _iter_chunks(p_column, batch_content_it, **kwargs):
81
+ handler = __handle_agen_to_gen if kwargs["infer_mode"] == "batch_stream_async" else __handle_gen
82
+ p_batch = [item[p_column] for item in batch_content_it]
83
+ it = handler(handle=kwargs["handle_batch_func"], batch=p_batch, event_loop=kwargs["event_loop"])
84
+ for ind_in_batch, chunk in it:
85
+ yield ind_in_batch, chunk
86
+
87
+
88
+ def _infer_batch(batch, batch_ind, schema, return_mode, cols=None, **kwargs):
68
89
  assert (isinstance(batch, list))
69
90
 
70
91
  if len(batch) == 0:
@@ -75,18 +96,54 @@ def _infer_batch(batch, schema, cols=None, **kwargs):
75
96
  cols = list(first_item.keys()) if cols is None else cols
76
97
 
77
98
  for c in cols:
78
- _update_batch_content(c=c, batch=batch, schema=schema, **kwargs)
79
-
80
- return batch
81
99
 
82
-
83
- def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, limit_prompt=None, **kwargs):
100
+ # Handling prompt column.
101
+ if c in schema.p2r:
102
+ content_it = _iter_batch_prompts(c=c, batch_content_it=iter(batch), **kwargs)
103
+ for ind_in_batch, prompt in content_it:
104
+ batch[ind_in_batch][c] = prompt
105
+
106
+ # Handling column for inference.
107
+ if c in schema.r2p:
108
+ content_it = _iter_chunks(p_column=schema.r2p[c], batch_content_it=iter(batch), **kwargs)
109
+ # Register values.
110
+ for item in batch:
111
+ item[c] = []
112
+ for ind_in_batch, chunk in content_it:
113
+ # Append batch.
114
+ batch[ind_in_batch][c].append(chunk)
115
+ # Returning (optional).
116
+ if return_mode == "chunk":
117
+ global_ind = batch_ind * len(batch) + ind_in_batch
118
+ yield [global_ind, c, chunk]
119
+
120
+ # Convert content to string.
121
+ for item in batch:
122
+ item[c] = "".join(item[c])
123
+
124
+ if return_mode == "record":
125
+ for record in batch:
126
+ yield record
127
+
128
+ if return_mode == "batch":
129
+ yield batch
130
+
131
+
132
+ def iter_content(input_dicts_it, llm, schema, batch_size=1, limit_prompt=None,
133
+ infer_mode="batch", return_mode="batch", attempts=1, event_loop=None,
134
+ **kwargs):
84
135
  """ This method represent Python API aimed at application of `llm` towards
85
136
  iterator of input_dicts, producing content according to
86
137
  the given `schema`.
87
138
  """
139
+ assert (infer_mode in INFER_MODES.keys())
140
+ assert (return_mode in ["batch", "chunk", "record"])
88
141
  assert (isinstance(llm, BaseLM))
89
142
 
143
+ # Setup event loop.
144
+ event_loop = asyncio.get_event_loop_policy().get_event_loop() \
145
+ if event_loop is None else event_loop
146
+
90
147
  # Quick initialization of the schema.
91
148
  if isinstance(schema, str):
92
149
  schema = JsonService.read(schema)
@@ -94,35 +151,36 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, l
94
151
  schema = SchemaService(json_data=schema)
95
152
 
96
153
  prompts_it = map(
97
- lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
154
+ lambda data: DictionaryService.custom_update(src_dict=dict(data), other_dict=schema.cot_args),
98
155
  input_dicts_it
99
156
  )
100
157
 
158
+ handle_batch_func = lambda batch, **handle_kwargs: INFER_MODES[infer_mode](
159
+ llm,
160
+ DataService.limit_prompts(batch, limit=limit_prompt),
161
+ **handle_kwargs
162
+ )
163
+
164
+ # Optional wrapping into attempts.
165
+ if attempts > 1:
166
+ # Optional setup of the logger.
167
+ logger = logging.getLogger(__name__)
168
+ logging.basicConfig(level=logging.INFO)
169
+
170
+ attempt_dec = attempt_wrapper(attempts=attempts,
171
+ delay_sec=kwargs.get("attempt_delay_sec", 1),
172
+ logger=logger)
173
+ handle_batch_func = attempt_dec(handle_batch_func)
174
+
101
175
  content_it = (_infer_batch(batch=batch,
102
- handle_batch_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
176
+ batch_ind=batch_ind,
177
+ infer_mode=infer_mode,
178
+ handle_batch_func=handle_batch_func,
179
+ handle_missed_value_func=lambda *_: None,
180
+ return_mode=return_mode,
103
181
  schema=schema,
182
+ event_loop=event_loop,
104
183
  **kwargs)
105
- for batch in BatchIterator(prompts_it, batch_size=batch_size))
106
-
107
- yield from content_it if return_batch else chain.from_iterable(content_it)
184
+ for batch_ind, batch in enumerate(BatchIterator(prompts_it, batch_size=batch_size)))
108
185
 
109
-
110
- def init_llm(adapter, **model_kwargs):
111
- """ This method perform dynamic initialization of LLM from third-party resource.
112
- """
113
- assert (isinstance(adapter, str))
114
-
115
- # List of the Supported models and their API wrappers.
116
- models_preset = {
117
- "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
118
- class_name=llm_model_params)(**model_kwargs)
119
- }
120
-
121
- # Initialize LLM model.
122
- params = adapter.split(':')
123
- llm_model_type = params[0]
124
- llm_model_name = params[1] if len(params) > 1 else params[-1]
125
- llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
126
- llm = find_by_prefix(d=models_preset, key=llm_model_type)()
127
-
128
- return llm, llm_model_name
186
+ yield from chain.from_iterable(content_it)
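The reworked `api.py` above drops `init_llm` and turns `iter_content` into a single generator: `INFER_MODES` selects how each batch reaches the model (`single`, `single_stream`, `batch`, `batch_async`, `batch_stream_async`), `attempts` optionally wraps the call in `attempt_wrapper`, and `return_mode` controls whether the generator yields whole batches, single records, or `[global_index, column, chunk]` triples. A minimal sketch of driving the new API with a toy synchronous model; `EchoLM` and the schema path are illustrative placeholders, not part of the package:

```python
from bulk_chain.api import iter_content
from bulk_chain.core.llm_base import BaseLM


class EchoLM(BaseLM):
    """Toy model for illustration only: echoes every prompt back."""

    def ask(self, content):
        # With infer_mode="batch" the whole batch arrives as one list,
        # so one response string is returned per prompt.
        return [f"echo: {prompt}" for prompt in content]


content_it = iter_content(
    input_dicts_it=iter([{"text": "cats"}, {"text": "dogs"}]),
    llm=EchoLM(),
    schema="YOUR_SCHEMA.json",   # placeholder path to a CoT schema file
    infer_mode="batch",          # one llm.ask(batch) call per batch
    return_mode="record",        # yield one completed dict at a time
    batch_size=2,
)

for record in content_it:
    print(record)
```

With `return_mode="chunk"` the same loop would instead receive `[global_index, column, chunk]` triples as they are produced, which is the form streaming consumers are expected to handle.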
bulk_chain/core/llm_base.py CHANGED
@@ -1,52 +1,24 @@
1
- import logging
2
- import time
3
-
4
- from bulk_chain.core.utils import format_model_name
5
-
6
-
7
1
  class BaseLM(object):
8
2
 
9
- def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
10
- support_batching=False, **kwargs):
11
-
12
- self.__name = name
13
- self.__attempts = 1 if attempts is None else attempts
14
- self.__delay_sec = delay_sec
15
- self.__support_batching = support_batching
16
-
17
- if enable_log:
18
- self.__logger = logging.getLogger(__name__)
19
- logging.basicConfig(level=logging.INFO)
20
-
21
- def ask_core(self, batch):
22
-
23
- for i in range(self.__attempts):
24
- try:
25
- if self.__support_batching:
26
- # Launch in batch mode.
27
- content = batch
28
- else:
29
- # Launch in non-batch mode.
30
- assert len(batch) == 1, "The LM does not support batching," \
31
- f" while size of the content is {len(batch)} which is not equal 1. " \
32
- f"Please enable batch-supporting or set required inference settings."
33
- content = batch[0]
3
+ def __init__(self, **kwargs):
4
+ pass
34
5
 
35
- response = self.ask(content)
36
-
37
- # Wrapping into batch the response in the case of non-batching mode.
38
- return response if self.__support_batching else [response]
39
-
40
- except Exception as e:
41
- if self.__logger is not None:
42
- self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i, self.__attempts))
43
- self.__logger.info(e)
44
- time.sleep(self.__delay_sec)
6
+ def ask(self, content):
7
+ """ Assumes to return str.
8
+ """
9
+ raise NotImplemented()
45
10
 
46
- raise Exception("Can't infer")
11
+ def ask_stream(self, content):
12
+ """ Assumes to return generator.
13
+ """
14
+ raise NotImplemented()
47
15
 
48
- def ask(self, content):
16
+ async def ask_async(self, prompt):
17
+ """ Assumes to return co-routine.
18
+ """
49
19
  raise NotImplemented()
50
20
 
51
- def name(self):
52
- return format_model_name(self.__name)
21
+ async def ask_stream_async(self, batch):
22
+ """ Assumes to return AsyncGenerator.
23
+ """
24
+ raise NotImplemented()
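The slimmed-down `BaseLM` drops the built-in retrying, logging and batching switches of 0.25.3 (retries now live in `attempt_wrapper` in `core/utils.py`, batching dispatch in `INFER_MODES`) and leaves four overridable hooks, one per inference mode. Below is a hedged sketch of a provider that implements all four on top of a single blocking call; `MyProvider` and its fake response are illustrative. Note that `ask_stream_async` is consumed through `asyncio.gather`, so it is written here as a coroutine that returns an async generator:

```python
import asyncio

from bulk_chain.core.llm_base import BaseLM


class MyProvider(BaseLM):
    """Illustrative adapter built on a single blocking request."""

    def __init__(self, api_token=None, **kwargs):
        super().__init__(**kwargs)
        self.api_token = api_token

    def ask(self, content):
        # Stand-in for a real blocking API call.
        return f"response for: {content}"

    def ask_stream(self, content):
        # Plain generator: yield the answer in small chunks.
        for token in self.ask(content).split():
            yield token + " "

    async def ask_async(self, prompt):
        # Coroutine: run the blocking call in a worker thread.
        return await asyncio.to_thread(self.ask, prompt)

    async def ask_stream_async(self, prompt):
        # Coroutine returning an async generator, matching how the
        # "batch_stream_async" mode first gathers and then merges them.
        async def chunks():
            for token in self.ask(prompt).split():
                yield token + " "
        return chunks()
```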
bulk_chain/core/service_asyncio.py ADDED
@@ -0,0 +1,65 @@
1
+ import asyncio
2
+ from typing import AsyncGenerator, Any
3
+
4
+
5
+ class AsyncioService:
6
+
7
+ @staticmethod
8
+ async def _run_tasks_async(batch, async_handler):
9
+ tasks = [async_handler(prompt) for prompt in batch]
10
+ return await asyncio.gather(*tasks)
11
+
12
+ @staticmethod
13
+ async def _run_generator(gen, output_queue, idx):
14
+ try:
15
+ async for item in gen:
16
+ await output_queue.put((idx, item))
17
+ finally:
18
+ await output_queue.put((idx, StopAsyncIteration))
19
+
20
+
21
+ @staticmethod
22
+ def run_tasks(event_loop, **tasks_kwargs):
23
+ return event_loop.run_until_complete(AsyncioService._run_tasks_async(**tasks_kwargs))
24
+
25
+ @staticmethod
26
+ async def merge_generators(*gens: AsyncGenerator[Any, None]) -> AsyncGenerator[Any, None]:
27
+
28
+ output_queue = asyncio.Queue()
29
+ tasks = [
30
+ asyncio.create_task(AsyncioService._run_generator(gen, output_queue, idx))
31
+ for idx, gen in enumerate(gens)
32
+ ]
33
+
34
+ finished = set()
35
+ while len(finished) < len(tasks):
36
+ idx, item = await output_queue.get()
37
+ if item is StopAsyncIteration:
38
+ finished.add(idx)
39
+ else:
40
+ yield item
41
+
42
+ for task in tasks:
43
+ task.cancel()
44
+
45
+ @staticmethod
46
+ def async_gen_to_iter(gen, loop=None):
47
+ """ This approach is limited. Could be considered as legacy.
48
+ https://stackoverflow.com/questions/71580727/translating-async-generator-into-sync-one/78573267#78573267
49
+ """
50
+
51
+ loop_created = False
52
+ if loop is None:
53
+ loop_created = True
54
+ loop = asyncio.new_event_loop()
55
+
56
+ asyncio.set_event_loop(loop)
57
+ try:
58
+ while True:
59
+ try:
60
+ yield loop.run_until_complete(gen.__anext__())
61
+ except StopAsyncIteration:
62
+ break
63
+ finally:
64
+ if loop_created:
65
+ loop.close()
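`service_asyncio.py` is new in 1.1.0 and carries the async plumbing behind the `batch_async` and `batch_stream_async` modes: `run_tasks` gathers one coroutine per prompt on a given loop, `merge_generators` multiplexes several async generators through a queue, and `async_gen_to_iter` drains an async generator from synchronous code. A small self-contained sketch of the two generator helpers, independent of any LLM (the `count` generator is a toy stand-in):

```python
import asyncio

from bulk_chain.core.service_asyncio import AsyncioService


async def count(tag, n):
    # Toy async generator standing in for a streamed LLM answer.
    for i in range(n):
        await asyncio.sleep(0.01)
        yield f"{tag}-{i}"


# Interleave two async streams and consume them from regular sync code.
merged = AsyncioService.merge_generators(count("a", 3), count("b", 2))
for item in AsyncioService.async_gen_to_iter(gen=merged):
    print(item)
```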
bulk_chain/core/service_batch.py CHANGED
@@ -1,27 +1,8 @@
1
- class BatchService(object):
2
-
3
- @staticmethod
4
- def handle_param_as_batch(batch, src_param, tgt_param, handle_batch_func, handle_entry_func):
5
- assert (isinstance(batch, list))
6
- assert (isinstance(src_param, str))
7
- assert (callable(handle_batch_func))
8
-
9
- _batch = [item[src_param] for item in batch]
10
-
11
- # Do handling for the batch.
12
- _handled_batch = handle_batch_func(_batch)
13
- assert (isinstance(_handled_batch, list))
14
-
15
- # Apply changes.
16
- for i, item in enumerate(batch):
17
- item[tgt_param] = handle_entry_func(entry=_handled_batch[i], info={"ind": i, "param": tgt_param})
18
-
19
-
20
1
  class BatchIterator:
21
2
 
22
3
  def __init__(self, data_iter, batch_size, end_value=None, filter_func=None):
23
- assert(isinstance(batch_size, int) and batch_size > 0)
24
- assert(callable(end_value) or end_value is None)
4
+ assert (isinstance(batch_size, int) and batch_size > 0)
5
+ assert (callable(end_value) or end_value is None)
25
6
  self.__data_iter = data_iter
26
7
  self.__index = 0
27
8
  self.__batch_size = batch_size
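`BatchService.handle_param_as_batch` is removed (its role is now covered by `_iter_chunks` in `api.py`), so `service_batch.py` retains only `BatchIterator`, which groups a flat iterator into lists of at most `batch_size` items for `iter_content`. A quick illustration, with the output shape inferred from how `iter_content` consumes it:

```python
from bulk_chain.core.service_batch import BatchIterator

rows = ({"id": i} for i in range(5))

for batch in BatchIterator(rows, batch_size=2):
    # Expected: lists of up to 2 dicts each.
    print(batch)
```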
bulk_chain/core/utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import importlib
2
2
  import logging
3
3
  import sys
4
+ import time
4
5
  from collections import Counter
5
6
  from os.path import dirname, join, basename
6
7
 
@@ -48,28 +49,6 @@ def iter_params(text):
48
49
  beg = pe+1
49
50
 
50
51
 
51
- def format_model_name(name):
52
- return name.replace("/", "_")
53
-
54
-
55
- def parse_filepath(filepath, default_filepath=None, default_ext=None):
56
- """ This is an auxiliary function for handling sources and targets from cmd string.
57
- """
58
- if filepath is None:
59
- return default_filepath, default_ext, None
60
- info = filepath.split(":")
61
- filepath = info[0]
62
- meta = info[1] if len(info) > 1 else None
63
- ext = filepath.split('.')[-1] if default_ext is None else default_ext
64
- return filepath, ext, meta
65
-
66
-
67
- def handle_table_name(name):
68
- return name.\
69
- replace('-', '_').\
70
- replace('.', "_")
71
-
72
-
73
52
  def auto_import(name, is_class=False):
74
53
  """ Import from the external python packages.
75
54
  """
@@ -82,10 +61,10 @@ def auto_import(name, is_class=False):
82
61
  return m() if is_class else m
83
62
 
84
63
 
85
- def dynamic_init(class_dir, class_filepath, class_name=None):
64
+ def dynamic_init(class_filepath, class_name=None):
86
65
 
87
66
  # Registering path.
88
- target = join(class_dir, dirname(class_filepath))
67
+ target = join(dirname(class_filepath))
89
68
  logger.info(f"Adding sys path for `{target}`")
90
69
  sys.path.insert(1, target)
91
70
  class_path_list = class_filepath.split('/')
@@ -111,3 +90,21 @@ def optional_limit_iter(it_data, limit=None):
111
90
  if limit is not None and counter["returned"] > limit:
112
91
  break
113
92
  yield data
93
+
94
+
95
+ def attempt_wrapper(attempts, delay_sec=1, logger=None):
96
+ def decorator(func):
97
+ def wrapper(*args, **kwargs):
98
+ for i in range(attempts):
99
+ try:
100
+ # Do action.
101
+ return func(*args, **kwargs)
102
+ except Exception as e:
103
+ if logger is not None:
104
+ logger.info(f"Unable to infer the result. Try {i} out of {attempts}.")
105
+ logger.info(e)
106
+ if delay_sec is not None:
107
+ time.sleep(delay_sec)
108
+ raise Exception(f"Failed after {attempts} attempts")
109
+ return wrapper
110
+ return decorator
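`attempt_wrapper` replaces the retry loop that previously lived inside `BaseLM.ask_core`: it is a plain decorator factory, which is how `iter_content` wraps `handle_batch_func` when `attempts > 1`, and it can equally wrap any flaky callable directly. A small illustrative usage (the `flaky_call` function is hypothetical):

```python
import logging
import random

from bulk_chain.core.utils import attempt_wrapper

logging.basicConfig(level=logging.INFO)


@attempt_wrapper(attempts=3, delay_sec=1, logger=logging.getLogger(__name__))
def flaky_call():
    # Stand-in for a remote LLM request that sometimes fails.
    if random.random() < 0.5:
        raise ConnectionError("temporary failure")
    return "ok"


print(flaky_call())  # retried up to 3 times; raises "Failed after 3 attempts" if none succeed
```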
bulk_chain-1.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.3
3
+ Version: 1.1.0
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,10 +15,8 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: tqdm
19
- Requires-Dist: source-iter ==0.24.3
20
18
 
21
- # bulk-chain 0.25.3
19
+ # bulk-chain 1.1.0
22
20
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
23
21
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
24
22
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -31,7 +29,7 @@ Requires-Dist: source-iter ==0.24.3
31
29
  <p align="center">
32
30
  <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
31
  <br>
34
- <a href="https://github.com/nicolay-r/bulk-chain/blob/master/README.md#demo-mode">👉<b>demo</b>👈</a>
32
+ <a href="https://github.com/nicolay-r/bulk-chain-shell">👉<b>demo</b>👈</a>
35
33
  </p>
36
34
 
37
35
  A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
@@ -39,11 +37,7 @@ A no-strings-attached **framework** for your LLM that allows applying Chain-of-
39
37
  ### Main Features
40
38
  * ✅ **No-strings**: you're free from LLM dependencies and flexible in `venv` customization.
41
39
  * ✅ **Supports schema descriptions** for the Chain-of-Thought concept.
42
- * ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
43
-
44
- ### Extra Features
45
- * ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
46
-
40
+ * ✅ **Provides an iterator over an unlimited number of input contexts**
47
41
 
48
42
  # Installation
49
43
 
@@ -83,60 +77,37 @@ Below, is an example on how to declare your own schema:
83
77
 
84
78
  # Usage
85
79
 
86
- Preliminary steps:
87
-
88
- 1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json)))
89
- 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
90
-
91
- ## Shell
92
-
93
- ### Demo Mode
94
-
95
- **demo mode** to interact with LLM via command line with LLM output streaming support.
96
- The video below illustrates an example of application for sentiment analysis on author opinion extraction towards mentioned object in text.
97
-
98
- Quck start with launching demo:
99
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
100
- 2. 📜 Setup your reasoning `thor_cot_schema.json` according to the [following example ↗️](test/schema/thor_cot_schema.json)
101
- 3. 🚀 Launch `demo.py` as follows:
102
- ```bash
103
- python3 -m bulk_chain.demo \
104
- --schema "test/schema/thor_cot_schema.json" \
105
- --adapter "dynamic:replicate_104.py:Replicate" \
106
- %%m \
107
- --model_name "meta/meta-llama-3-70b-instruct" \
108
- --api_token "<REPLICATE-API-TOKEN>" \
109
- --stream
110
- ```
111
-
112
- 📺 This video showcase application of the [↗️ Sentiment Analysis Schema](https://github.com/nicolay-r/bulk-chain/blob/master/test/schema/thor_cot_schema.json) towards [LLaMA-3-70B-Instruct](https://replicate.com/meta/meta-llama-3-70b-instruct) hosted by Replicate for reasoning over submitted texts
113
- ![sa-bulk-chain-cot-final](https://github.com/user-attachments/assets/0cc8fdcb-6ddb-44a3-8f05-d76250ae6423)
80
+ ## 🤖 Prepare
114
81
 
82
+ 1. [schema](#chain-of-thought-schema)
83
+ * [Example for Sentiment Analysis](test/schema/thor_cot_schema.json)
84
+ 2. **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
85
+ 3. Data (iter of dictionaries)
115
86
 
116
- ### Inference Mode
87
+ ## 🚀 Launch
117
88
 
118
- > **NOTE:** You have to install `source-iter` and `tqdm` packages that actual [dependencies](dependencies.txt) of this project
89
+ > **API**: For more details see the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
119
90
 
120
- 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
121
- ```bash
122
- wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
123
- ```
124
- 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
125
- 3. 🚀 Launch inference using `DeepSeek-R1`:
126
- ```bash
127
- python3 -m bulk_chain.infer \
128
- --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
129
- --schema "test/schema/default.json" \
130
- --adapter "replicate_104.py:Replicate" \
131
- %%m \
132
- --model_name "deepseek-ai/deepseek-r1" \
133
- --api_token "<REPLICATE-API-TOKEN>"
91
+ ```python
92
+ from bulk_chain.core.utils import dynamic_init
93
+ from bulk_chain.api import iter_content
94
+
95
+ content_it = iter_content(
96
+ # 1. Your schema.
97
+ schema="YOUR_SCHEMA.json",
98
+ # 2. Your third-party model implementation.
99
+ llm=dynamic_init(class_filepath="replicate_104.py", class_name="Replicate")(api_token="<API-KEY>"),
100
+ # 3. Customize your inference and result providing modes:
101
+ infer_mode="batch_async",
102
+ return_mode="batch",
103
+ # 4. Your iterator of dictionaries
104
+ input_dicts_it=YOUR_DATA_IT,
105
+ )
106
+
107
+ for content in content_it:
108
+ # Handle your LLM responses here ...
134
109
  ```
135
110
 
136
- ## API
137
-
138
- Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
139
-
140
111
 
141
112
  # Embed your LLM
142
113
 
bulk_chain-1.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
1
+ bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ bulk_chain/api.py,sha256=gPGjaHYIn2Ewn6yXIXER-CM5SgXQ3ZJH-SdRyaPDOo0,6890
3
+ bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ bulk_chain/core/llm_base.py,sha256=aa73TGW03yLXMHY4b_1NgquRvP0CzH8IWZkcFPABFUg,557
5
+ bulk_chain/core/service_asyncio.py,sha256=S-D4K3LBa3noKTm0tXazluYVI8cBgN1IB6v6MFoMyNQ,1972
6
+ bulk_chain/core/service_batch.py,sha256=lWmjO0aU6h2rmfx_kGmNqt0Rdeaf2a4Dn5VyfKFkfDs,1033
7
+ bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
8
+ bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
9
+ bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
10
+ bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
11
+ bulk_chain/core/utils.py,sha256=tp1FJQBmJt-3QmG7B0hyJNTFyg_8BwTTdl8xTxSgNDk,3140
12
+ bulk_chain-1.1.0.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
13
+ bulk_chain-1.1.0.dist-info/METADATA,sha256=EheCGDisKF0TwmzJfnDxW-rgsDVPNpCYGOvuaDn91tw,4428
14
+ bulk_chain-1.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
15
+ bulk_chain-1.1.0.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
16
+ bulk_chain-1.1.0.dist-info/RECORD,,
bulk_chain-1.1.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.3)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
bulk_chain/core/provider_sqlite.py DELETED
@@ -1,127 +0,0 @@
1
- import sqlite3
2
-
3
-
4
- class SQLite3Service(object):
5
-
6
- @staticmethod
7
- def __create_table(table_name, columns, id_column_name,
8
- id_column_type, cur, sqlite3_column_types=None):
9
-
10
- # Setting up default column types.
11
- if sqlite3_column_types is None:
12
- types_count = len(columns) if id_column_name in columns else len(columns) - 1
13
- sqlite3_column_types = ["TEXT"] * types_count
14
-
15
- # Provide the ID column.
16
- sqlite3_column_types = [id_column_type] + sqlite3_column_types
17
-
18
- # Compose the whole columns list.
19
- content = ", ".join([f"[{item[0]}] {item[1]}" for item in zip(columns, sqlite3_column_types)])
20
- cur.execute(f"CREATE TABLE IF NOT EXISTS {table_name}({content})")
21
- cur.execute(f"CREATE INDEX IF NOT EXISTS [{id_column_name}] ON {table_name}([{id_column_name}])")
22
-
23
- @staticmethod
24
- def __it_row_lists(cursor):
25
- for row in cursor:
26
- yield row
27
-
28
- @staticmethod
29
- def create_table_if_not_exist(**kwargs):
30
- return SQLite3Service.__create_table(**kwargs)
31
-
32
- @staticmethod
33
- def entry_exist(table_name, target, id_column_name, id_value, **connect_kwargs) -> bool:
34
- with sqlite3.connect(target, **connect_kwargs) as con:
35
- cursor = con.cursor()
36
-
37
- # Check table existance.
38
- query = "SELECT name FROM sqlite_master WHERE type='table' AND name=?"
39
- cursor.execute(query, (table_name,))
40
- if cursor.fetchone() is None:
41
- return False
42
-
43
- # Check element.
44
- r = cursor.execute(f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE [{id_column_name}]='{id_value}');")
45
- ans = r.fetchone()[0]
46
- return ans == 1
47
-
48
- @staticmethod
49
- def write(data_it, target, table_name, columns=None, id_column_name="id", data2col_func=None,
50
- id_column_type="INTEGER", sqlite3_column_types=None, it_type='dict',
51
- create_table_if_not_exist=True, skip_existed=True, **connect_kwargs):
52
-
53
- need_set_column_id = True
54
- need_initialize_columns = columns is None
55
-
56
- # Setup default columns.
57
- columns = [] if columns is None else columns
58
-
59
- with sqlite3.connect(target, **connect_kwargs) as con:
60
- cur = con.cursor()
61
-
62
- for content in data_it:
63
-
64
- if it_type == 'dict':
65
- # Extracting columns from data.
66
- data = content
67
- uid = data[id_column_name]
68
- row_columns = list(data.keys())
69
- row_params_func = lambda: [data2col_func(c, data) if data2col_func is not None else data[c]
70
- for c in row_columns]
71
- # Append columns if needed.
72
- if need_initialize_columns:
73
- columns = list(row_columns)
74
- elif it_type is None:
75
- # Setup row columns.
76
- uid, data = content
77
- row_columns = columns
78
- row_params_func = lambda: [uid] + data
79
- else:
80
- raise Exception(f"it_type {it_type} does not supported!")
81
-
82
- if need_set_column_id:
83
- # Register ID column.
84
- if id_column_name not in columns:
85
- columns.append(id_column_name)
86
- # Place ID column first.
87
- columns.insert(0, columns.pop(columns.index(id_column_name)))
88
- need_set_column_id = False
89
-
90
- if create_table_if_not_exist:
91
- SQLite3Service.__create_table(
92
- columns=columns, table_name=table_name, cur=cur,
93
- id_column_name=id_column_name, id_column_type=id_column_type,
94
- sqlite3_column_types=sqlite3_column_types)
95
-
96
- # Check that each rows satisfies criteria of the first row.
97
- [Exception(f"{column} is expected to be in row!") for column in row_columns if column not in columns]
98
-
99
- if skip_existed:
100
- r = cur.execute(f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE [{id_column_name}]='{uid}');")
101
- ans = r.fetchone()[0]
102
-
103
- if ans == 1:
104
- continue
105
-
106
- params = ", ".join(tuple(['?'] * (len(columns))))
107
- row_columns_str = ", ".join([f"[{col}]" for col in row_columns])
108
- content_list = row_params_func()
109
- cur.execute(f"INSERT INTO {table_name}({row_columns_str}) VALUES ({params})", content_list)
110
- con.commit()
111
-
112
- cur.close()
113
-
114
- @staticmethod
115
- def read(src, table="content", **connect_kwargs):
116
- with sqlite3.connect(src, **connect_kwargs) as conn:
117
- cursor = conn.cursor()
118
- cursor.execute(f"SELECT * FROM {table}")
119
- for record_list in SQLite3Service.__it_row_lists(cursor):
120
- yield record_list
121
-
122
- @staticmethod
123
- def read_columns(target, table="content", **connect_kwargs):
124
- with sqlite3.connect(target, **connect_kwargs) as conn:
125
- cursor = conn.cursor()
126
- cursor.execute(f"PRAGMA table_info({table})")
127
- return [row[1] for row in cursor.fetchall()]
bulk_chain/core/service_args.py DELETED
@@ -1,72 +0,0 @@
1
- class CmdArgsService:
2
-
3
- @staticmethod
4
- def autocast(v):
5
- for t in [int, float, str]:
6
- try:
7
- return t(v)
8
- except:
9
- pass
10
-
11
- @staticmethod
12
- def iter_arguments(lst):
13
-
14
- def __release():
15
-
16
- # We use the True value by default to treat the related parameter as flag.
17
- if len(buf) == 0:
18
- buf.append(True)
19
-
20
- return key, buf if len(buf) > 1 else buf[0]
21
-
22
- key = None
23
- buf = []
24
- for a in lst:
25
- if a.startswith('--'):
26
- # release
27
- if key is not None:
28
- yield __release()
29
- # set new key and empty buf
30
- key = a[2:]
31
- buf = []
32
- else:
33
- # append argument into buffer.
34
- buf.append(a)
35
-
36
- # Sharing the remaining params.
37
- if key is not None:
38
- yield __release()
39
-
40
- @staticmethod
41
- def __find_suffix_ind(lst, idx_from, end_prefix):
42
- for i in range(idx_from, len(lst)):
43
- if lst[i].startswith(end_prefix):
44
- return i
45
- return len(lst)
46
-
47
- @staticmethod
48
- def extract_native_args(lst, end_prefix):
49
- return lst[:CmdArgsService.__find_suffix_ind(lst, idx_from=0, end_prefix=end_prefix)]
50
-
51
- @staticmethod
52
- def find_grouped_args(lst, starts_with, end_prefix):
53
- """Slices a list in two, cutting on index matching "sep"
54
- """
55
-
56
- # Checking the presence of starts_with.
57
- # We have to return empty content in the case of absence starts_with in the lst.
58
- if starts_with not in lst:
59
- return []
60
-
61
- # Assigning start index.
62
- idx_from = lst.index(starts_with) + 1
63
-
64
- # Assigning end index.
65
- idx_to = CmdArgsService.__find_suffix_ind(lst, idx_from=idx_from, end_prefix=end_prefix)
66
-
67
- return lst[idx_from:idx_to]
68
-
69
- @staticmethod
70
- def args_to_dict(args):
71
- return {k: CmdArgsService.autocast(v) if not isinstance(v, list) else v
72
- for k, v in CmdArgsService.iter_arguments(args)} if args is not None else {}
bulk_chain/core/service_llm.py DELETED
@@ -1,68 +0,0 @@
1
- from bulk_chain.api import iter_content
2
- from bulk_chain.core.llm_base import BaseLM
3
- from bulk_chain.core.utils_logger import StreamedLogger
4
-
5
-
6
- def pad_str(text, pad):
7
- return text.rjust(len(text) + pad, ' ')
8
-
9
-
10
- def nice_output(text, remove_new_line=False):
11
- short_text = text.replace("\n", "") if remove_new_line else text
12
- return short_text
13
-
14
-
15
- def chat_with_lm(lm, preset_dict=None, schema=None, model_name=None, pad=0):
16
- assert (isinstance(lm, BaseLM))
17
- assert (isinstance(model_name, str) or model_name is None)
18
-
19
- preset_dict = {} if preset_dict is None else preset_dict
20
-
21
- streamed_logger = StreamedLogger(__name__)
22
- do_exit = False
23
- model_name = model_name if model_name is not None else "agent"
24
-
25
- while not do_exit:
26
-
27
- # Launching the CoT engine loop.
28
- data_dict = {} | preset_dict
29
-
30
- def callback_str_func(entry, info):
31
- streamed_logger.info(pad_str(f"{model_name} ({info['param']})->\n", pad=pad))
32
- streamed_logger.info(nice_output(entry, remove_new_line=False))
33
- streamed_logger.info("\n\n")
34
-
35
- def handle_missed_value(col_name):
36
- user_input = input(f"Enter your prompt for `{col_name}`"
37
- f"(or 'exit' to quit): ")
38
-
39
- if user_input.lower() == 'exit':
40
- exit(0)
41
-
42
- return user_input
43
-
44
- def callback_stream_func(entry, info):
45
- if entry is None and info["action"] == "start":
46
- streamed_logger.info(pad_str(f"{model_name} ({info['param']})->\n", pad=pad))
47
- return
48
- if entry is None and info["action"] == "end":
49
- streamed_logger.info("\n\n")
50
- return
51
-
52
- streamed_logger.info(entry)
53
-
54
- content_it = iter_content(
55
- input_dicts_it=[data_dict],
56
- llm=lm,
57
- schema=schema,
58
- batch_size=1,
59
- return_batch=True,
60
- handle_missed_value_func=handle_missed_value,
61
- callback_str_func=callback_str_func,
62
- callback_stream_func=callback_stream_func,
63
- )
64
-
65
- for _ in content_it:
66
- user_input = input(f"Enter to continue (or 'exit' to quit) ...\n")
67
- if user_input.lower() == 'exit':
68
- do_exit = True
bulk_chain/core/utils_logger.py DELETED
@@ -1,41 +0,0 @@
1
- import logging
2
-
3
-
4
- def StreamedLogger(name: str) -> logging.Logger:
5
- """ https://medium.com/@r.das699/optimizing-logging-practices-for-streaming-data-in-python-521798e1ed82
6
- """
7
- root_handlers = logging.getLogger().handlers
8
- current_logger = logging.getLogger(name)
9
- if not root_handlers:
10
- new_handler = logging.StreamHandler()
11
- new_handler.terminator = ""
12
- new_handler.setFormatter(logging.Formatter("%(message)s"))
13
- current_logger.addHandler(new_handler)
14
- current_logger.propagate = False
15
- current_logger.setLevel(logging.INFO)
16
- return current_logger
17
-
18
- for handler in current_logger.handlers[:]:
19
- current_logger.removeHandler(handler)
20
-
21
- for handler_r in root_handlers:
22
- if type(handler_r) is logging.StreamHandler:
23
- new_handler = logging.StreamHandler()
24
- new_handler.terminator = ""
25
- new_handler.setFormatter(logging.Formatter("%(message)s"))
26
- current_logger.addHandler(new_handler)
27
- elif type(handler_r) is logging.FileHandler:
28
- new_handler = logging.FileHandler(
29
- handler_r.baseFilename,
30
- handler_r.mode,
31
- handler_r.encoding,
32
- handler_r.delay,
33
- handler_r.errors,
34
- )
35
- new_handler.terminator = "" # This will stop the printing in new line
36
- new_handler.setFormatter(logging.Formatter("%(message)s"))
37
- current_logger.addHandler(new_handler)
38
- else:
39
- continue
40
- current_logger.propagate = False # Don't propagate to root logger
41
- return current_logger
bulk_chain/demo.py DELETED
@@ -1,84 +0,0 @@
1
- import json
2
-
3
- import argparse
4
- import logging
5
- import sys
6
-
7
- from source_iter.service_jsonl import JsonlService
8
-
9
- from bulk_chain.api import init_llm
10
- from bulk_chain.core.service_args import CmdArgsService
11
- from bulk_chain.core.service_json import JsonService
12
- from bulk_chain.core.service_llm import chat_with_lm
13
- from bulk_chain.core.service_schema import SchemaService
14
- from bulk_chain.core.utils import parse_filepath
15
-
16
- logger = logging.getLogger(__name__)
17
- logging.basicConfig(level=logging.INFO)
18
-
19
-
20
- def iter_from_json(filepath):
21
- with open(filepath, "r") as f:
22
- content = json.load(f)
23
- for key, value in content.items():
24
- yield key, value
25
-
26
-
27
- def iter_from_text_file(filepath):
28
- with open(filepath, "r") as f:
29
- yield filepath.split('.')[0], f.read()
30
-
31
-
32
- if __name__ == '__main__':
33
-
34
- parser = argparse.ArgumentParser(description="LLM demo usage based on CoT schema")
35
- parser.add_argument('--adapter', dest='adapter', type=str, default=None)
36
- parser.add_argument('--attempts', dest='attempts', type=int, default=None)
37
- parser.add_argument('--src', dest='src', type=str, nargs="*", default=None)
38
- parser.add_argument('--schema', dest='schema', type=str, default=None,
39
- help="Path to the JSON file that describes schema")
40
- parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
41
- help="Optional trimming prompt by the specified amount of characters.")
42
-
43
- # Extract native arguments.
44
- native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
45
- args = parser.parse_args(args=native_args[1:])
46
-
47
- # Extract model-related arguments and Initialize Large Language Model.
48
- model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
49
- model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
50
- llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
51
-
52
- # Setup schema.
53
- schema = SchemaService(json_data=JsonService.read(args.schema))
54
- schema_name = schema.src.get("name", None)
55
- if schema is not None:
56
- logger.info(f"Using schema: {schema_name}")
57
-
58
- output_providers = {
59
- "jsonl": lambda filepath, data_it, header:
60
- JsonlService.write(target=filepath,
61
- data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
62
- }
63
-
64
- input_file_handlers = {
65
- "json": lambda filepath: iter_from_json(filepath),
66
- "txt": lambda filepath: iter_from_text_file(filepath)
67
- }
68
-
69
- # Input extension type defines the provider.
70
- if args.src is None:
71
- args.src = []
72
- if isinstance(args.src, str):
73
- args.src = [args.src]
74
- sources = [parse_filepath(s) for s in args.src]
75
-
76
- preset_dict = {}
77
- for fp, ext, _ in sources:
78
- for key, value in input_file_handlers[ext](fp):
79
- if key in preset_dict:
80
- raise Exception(f"While at handling {fp}: Key {key} is already registered!")
81
- preset_dict[key] = value
82
-
83
- # Launch Demo.
84
- chat_with_lm(llm, preset_dict=preset_dict, schema=schema, model_name=llm_model_name)
bulk_chain/infer.py DELETED
@@ -1,193 +0,0 @@
1
- from itertools import chain
2
- from os.path import join, basename
3
-
4
- import argparse
5
- import logging
6
- import sys
7
-
8
- from source_iter.service_csv import CsvService
9
- from source_iter.service_jsonl import JsonlService
10
- from tqdm import tqdm
11
-
12
- from bulk_chain.api import INFER_MODES, _infer_batch, CWD, init_llm
13
- from bulk_chain.core.llm_base import BaseLM
14
- from bulk_chain.core.provider_sqlite import SQLite3Service
15
- from bulk_chain.core.service_args import CmdArgsService
16
- from bulk_chain.core.service_batch import BatchIterator
17
- from bulk_chain.core.service_dict import DictionaryService
18
- from bulk_chain.core.service_json import JsonService
19
- from bulk_chain.core.service_schema import SchemaService
20
- from bulk_chain.core.utils import handle_table_name, optional_limit_iter, parse_filepath
21
-
22
- logger = logging.getLogger(__name__)
23
- logging.basicConfig(level=logging.INFO)
24
-
25
- WRITER_PROVIDERS = {
26
- "sqlite": lambda filepath, table_name, data_it, **kwargs: SQLite3Service.write(
27
- data_it=data_it, target=filepath, table_name=table_name, skip_existed=True, **kwargs)
28
- }
29
-
30
- READER_PROVIDERS = {
31
- "sqlite": lambda filepath, table_name: SQLite3Service.read(filepath, table=table_name)
32
- }
33
-
34
-
35
- def infer_batch(batch, columns=None, **kwargs):
36
- assert (len(batch) > 0)
37
- # TODO. Support proper selection of columns.
38
- cols = batch[0].keys() if columns is None else columns
39
- return _infer_batch(batch=batch, cols=cols, **kwargs)
40
-
41
-
42
- def raise_(ex):
43
- raise ex
44
-
45
-
46
- def iter_content_cached(input_dicts_it, llm, schema, cache_target, batch_size, id_column_name, limit_prompt=None,
47
- **cache_kwargs):
48
- assert (isinstance(llm, BaseLM))
49
- assert (isinstance(cache_target, str))
50
-
51
- # Quick initialization of the schema.
52
- if isinstance(schema, str):
53
- schema = JsonService.read(schema)
54
- if isinstance(schema, dict):
55
- schema = SchemaService(json_data=schema)
56
-
57
- # Parse target.
58
- cache_filepath, _, cache_table = parse_filepath(filepath=cache_target)
59
-
60
- # Iterator of the queries.
61
- prompts_it = map(
62
- lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
63
- input_dicts_it
64
- )
65
-
66
- prompts_batched_it = BatchIterator(
67
- data_iter=iter(tqdm(prompts_it, desc="Iter Content")),
68
- batch_size=batch_size,
69
- filter_func=lambda data: not SQLite3Service.entry_exist(
70
- id_column_name=id_column_name, table_name=cache_table, target=cache_filepath,
71
- id_value=data[id_column_name], **cache_kwargs)
72
- )
73
-
74
- results_it = map(
75
- lambda batch: infer_batch(
76
- batch=batch, schema=schema,
77
- handle_batch_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
78
- handle_missed_value_func=lambda col_name: raise_(
79
- Exception(f"Value for {col_name} is undefined. Filling undefined values is not supported")
80
- )
81
- ),
82
- prompts_batched_it
83
- )
84
-
85
- # Perform caching first.
86
- WRITER_PROVIDERS["sqlite"](
87
- filepath=cache_filepath,
88
- table_name=cache_table,
89
- data_it=chain.from_iterable(results_it),
90
- id_column_name=id_column_name,
91
- **cache_kwargs)
92
-
93
- # Then retrieve data.
94
- return READER_PROVIDERS["sqlite"](filepath=cache_filepath, table_name=cache_table)
95
-
96
-
97
- if __name__ == '__main__':
98
-
99
- parser = argparse.ArgumentParser(description="Infer Instruct LLM inference based on CoT schema")
100
- parser.add_argument('--adapter', dest='adapter', type=str, default=None)
101
- parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
102
- parser.add_argument('--src', dest='src', type=str, nargs="?", default=None)
103
- parser.add_argument('--schema', dest='schema', type=str, default=None,
104
- help="Path to the JSON file that describes schema")
105
- parser.add_argument('--to', dest='to', type=str, default=None, choices=["csv", "sqlite"])
106
- parser.add_argument('--output', dest='output', type=str, default=None)
107
- parser.add_argument('--limit', dest='limit', type=int, default=None,
108
- help="Limit amount of source texts for prompting.")
109
- parser.add_argument('--batch-size', dest='batch_size', type=int, default=1)
110
- parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
111
- help="Optional trimming prompt by the specified amount of characters.")
112
-
113
- # Extract native arguments.
114
- native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
115
- args = parser.parse_args(args=native_args[1:])
116
-
117
- # Extract csv-related arguments.
118
- csv_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%csv", end_prefix="%%")
119
- csv_args_dict = CmdArgsService.args_to_dict(csv_args)
120
-
121
- # Extract model-related arguments and Initialize Large Language Model.
122
- model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
123
- model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": 1}
124
- llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
125
-
126
- # Setup schema.
127
- schema = SchemaService(json_data=JsonService.read(args.schema))
128
- schema_name = schema.src.get("name", None)
129
- if schema is not None:
130
- logger.info(f"Using schema: {schema_name}")
131
-
132
- input_providers = {
133
- "csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
134
- as_dict=True, skip_header=True,
135
- delimiter=csv_args_dict.get("delimiter", ","),
136
- escapechar=csv_args_dict.get("escapechar", None)),
137
- "tsv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
138
- as_dict=True, skip_header=True,
139
- delimiter=csv_args_dict.get("delimiter", "\t"),
140
- escapechar=csv_args_dict.get("escapechar", None)),
141
- "jsonl": lambda filepath: JsonlService.read(src=filepath, row_id_key=args.id_col)
142
- }
143
-
144
- output_providers = {
145
- "csv": lambda filepath, data_it, header: CsvService.write(target=filepath,
146
- data_it=data_it, header=header,
147
- delimiter=csv_args_dict.get("delimiter", ","),
148
- escapechar=csv_args_dict.get("escapechar", None),
149
- it_type=None),
150
- "tsv": lambda filepath, data_it, header: CsvService.write(target=filepath,
151
- data_it=data_it, header=header,
152
- delimiter=csv_args_dict.get("delimiter", "\t"),
153
- escapechar=csv_args_dict.get("escapechar", None),
154
- it_type=None),
155
- "jsonl": lambda filepath, data_it, header:
156
- JsonlService.write(target=filepath,
157
- data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
158
- }
159
-
160
- # Setup output.
161
- args.output = args.output.format(model=llm.name()) if args.output is not None else args.output
162
- tgt_filepath, tgt_ext, tgt_meta = parse_filepath(args.output, default_ext=args.to)
163
-
164
- # We do not support multiple files for other modes.
165
- src_filepath, src_ext, src_meta = parse_filepath(args.src)
166
-
167
- def default_output_file_template(ext):
168
- # This is a default template for output files to be generated.
169
- return "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema_name]), ext])
170
-
171
- # Setup cache target as well as the related table.
172
- cache_filepath = default_output_file_template(".sqlite") if tgt_filepath is None else tgt_filepath
173
- cache_table = handle_table_name(tgt_meta if tgt_meta is not None else "contents")
174
-
175
- # This is a content that we extracted via input provider.
176
- it_data = input_providers[src_ext](src_filepath)
177
-
178
- data_it = iter_content_cached(input_dicts_it=optional_limit_iter(it_data=it_data, limit=args.limit),
179
- limit_prompt=args.limit_prompt,
180
- schema=schema,
181
- llm=llm,
182
- batch_size=args.batch_size,
183
- id_column_name=args.id_col,
184
- cache_target=":".join([cache_filepath, cache_table]))
185
-
186
- # Setup output target
187
- tgt_ext = src_ext if tgt_ext is None else tgt_ext
188
- output_target = default_output_file_template(f".{tgt_ext}") if tgt_filepath is None else tgt_filepath
189
-
190
- # Perform output writing process.
191
- output_providers[tgt_ext](filepath=output_target,
192
- data_it=data_it,
193
- header=SQLite3Service.read_columns(target=cache_filepath, table=cache_table))
bulk_chain-0.25.3.dist-info/RECORD DELETED
@@ -1,21 +0,0 @@
1
- bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- bulk_chain/api.py,sha256=8hXJb66bEOf1izgeBmjrB9LexSMJD7GquUhIm76lfmY,4351
3
- bulk_chain/demo.py,sha256=20r_-ioR3fu3eqHJnCRK4aQmBKTMgjFAHzZPJcXaEz8,3186
4
- bulk_chain/infer.py,sha256=Qb7ZV3_DXIh1jQoEjER_H2rO4ntREj_iIwGpEpAXJCE,9157
5
- bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- bulk_chain/core/llm_base.py,sha256=fuWxfEOSRYvnoZMOcfnq1E2LIJKnrpsnxQ1z6SmY1nM,1839
7
- bulk_chain/core/provider_sqlite.py,sha256=sW4Yefp_zYuL8xVys8la5hG0Ng94jSqiXelPgGGB5B0,5327
8
- bulk_chain/core/service_args.py,sha256=lq4Veuh4QNu8mlCv8MT9S1rMxTn4FKalyp-3boYonVk,2136
9
- bulk_chain/core/service_batch.py,sha256=z1OND6x40QBvK2H_Wt8sRS8MadUvYTCHmbS_dCm9t7M,1678
10
- bulk_chain/core/service_data.py,sha256=OWWHHnr_plwxYTxLuvMrhEc1PbSx-XC3rbFzV0hy3vk,1107
11
- bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
12
- bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
13
- bulk_chain/core/service_llm.py,sha256=3WYoBgaiqoRwsoKq6VUUNMasbLj5rMCjGkU3OQVxGf8,2278
14
- bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
15
- bulk_chain/core/utils.py,sha256=1irk3RGLzJxeLZ-Tv0oOceOKG0ADtpEWniG2UGbYA_U,3089
16
- bulk_chain/core/utils_logger.py,sha256=BD-ADxaeeuHztaYjqtIY_cIzc5r2Svq9XwRtrgIEqyI,1636
17
- bulk_chain-0.25.3.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
18
- bulk_chain-0.25.3.dist-info/METADATA,sha256=-yt6rNoJSGqHRe-6z-mdf51wtwpVLYvcr8Yv_1qkdxQ,6037
19
- bulk_chain-0.25.3.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
20
- bulk_chain-0.25.3.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
21
- bulk_chain-0.25.3.dist-info/RECORD,,