PyPI - bulk-chain - Versions diffs - 0.24.1__py3-none-any.whl → 0.25.0__py3-none-any.whl - Mend

bulk-chain 0.24.1py3-none-any.whl → 0.25.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

bulk_chain/api.py +79 -0
bulk_chain/core/llm_base.py +42 -3
bulk_chain/core/service_args.py +25 -6
bulk_chain/core/service_batch.py +51 -0
bulk_chain/core/service_data.py +4 -0
bulk_chain/core/service_dict.py +10 -0
bulk_chain/core/service_json.py +2 -18
bulk_chain/core/service_llm.py +9 -9
bulk_chain/core/service_schema.py +1 -2
bulk_chain/infer.py +90 -72
{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/METADATA +37 -25
bulk_chain-0.25.0.dist-info/RECORD +18 -0
bulk_chain/core/provider_sqlite.py +0 -78
bulk_chain/core/service_csv.py +0 -57
bulk_chain-0.24.1.dist-info/RECORD +0 -17
{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/LICENSE +0 -0
{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/WHEEL +0 -0
{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/top_level.txt +0 -0

bulk_chain/api.py ADDED Viewed

@@ -0,0 +1,79 @@
+import os
+from itertools import chain
+from bulk_chain.core.llm_base import BaseLM
+from bulk_chain.core.service_batch import BatchIterator, BatchService
+from bulk_chain.core.service_data import DataService
+from bulk_chain.core.service_dict import DictionaryService
+from bulk_chain.core.service_json import JsonService
+from bulk_chain.core.service_schema import SchemaService
+# Inference modes: every entry is a callable `(llm, data, limit_prompt=None)` that
+# forwards `data` to `llm.ask_core` and returns its result.
+INFER_MODES = {
+    # `prompt` is either a single string or a list of strings. A list is trimmed
+    # element-wise: slicing the list itself with `limit_prompt` would drop whole
+    # prompts instead of shortening their text.
+    "default": lambda llm, prompt, limit_prompt=None: llm.ask_core(
+        DataService.limit_prompts(prompt, limit=limit_prompt) if isinstance(prompt, list)
+        else (prompt[:limit_prompt] if limit_prompt is not None else prompt)),
+    # `batch` is a list of prompts; each of them is cut by `limit_prompt`.
+    "batch": lambda llm, batch, limit_prompt=None: llm.ask_core(
+        DataService.limit_prompts(batch, limit=limit_prompt))
+}
+# Working directory captured once, at the import time of this module.
+CWD = os.getcwd()
+def _update_batch_content(c, batch, schema, infer_func):
+    """ Fills the column `c` of every item in `batch`; the items are modified in place.
+        A column listed in `schema.p2r` gets its prompt template rendered into text.
+        A column listed in `schema.r2p` gets the result of `infer_func` applied to
+        the values of the related prompt column.
+    """
+    assert (isinstance(batch, list))
+    assert (isinstance(c, str))
+    if c in schema.p2r:
+        # Render the template of every item, using the item itself as the data source.
+        for record in batch:
+            template = record[c]["prompt"]
+            record[c] = DataService.get_prompt_text(prompt=template, data_dict=record)
+    if c not in schema.r2p:
+        return
+    # This instruction takes a lot of time in a non-batching mode.
+    BatchService.handle_param_as_batch(batch=batch,
+                                       src_param=schema.r2p[c],
+                                       tgt_param=c,
+                                       handle_func=lambda prompts: infer_func(prompts))
+def _infer_batch(batch, schema, infer_func, cols=None):
+    """ Handles the columns `cols` of every item in `batch` and returns the same `batch` list.
+        When `cols` is None, the keys of the first item are handled, in their own order.
+        An empty batch is returned as it is.
+    """
+    assert (isinstance(batch, list))
+    assert (callable(infer_func))
+    if not batch:
+        return batch
+    columns = batch[0].keys() if cols is None else cols
+    for column in columns:
+        _update_batch_content(c=column, batch=batch, schema=schema, infer_func=infer_func)
+    return batch
+def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, limit_prompt=None):
+    """ This method represents the Python API aimed at applying `llm` to the
+        iterator of input dicts (`input_dicts_it`) according to the given `schema`.
+        schema: SchemaService instance, a dict to build it from, or a path to a JSON file with such a dict.
+        batch_size: maximal amount of input dicts grouped into a single batch.
+        return_batch: yield whole batches (lists of dicts) if True, separate dicts otherwise.
+        limit_prompt: optional length every prompt is cut to (`p[:limit_prompt]`).
+        NOTE: every input dict is updated in place with the entries of `schema.cot_args`.
+    """
+    assert (isinstance(llm, BaseLM))
+    # Quick initialization of the schema.
+    if isinstance(schema, str):
+        schema = JsonService.read(schema)
+    if isinstance(schema, dict):
+        schema = SchemaService(json_data=schema)
+    # Lazily extend every input dict with the schema entries.
+    prompts_it = map(
+        lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
+        input_dicts_it
+    )
+    # Lazily infer batch by batch; nothing is computed until the result is iterated.
+    content_it = (_infer_batch(batch=batch,
+                               infer_func=lambda batch: INFER_MODES["batch"](llm, batch, limit_prompt),
+                               schema=schema)
+                  for batch in BatchIterator(prompts_it, batch_size=batch_size))
+    yield from content_it if return_batch else chain.from_iterable(content_it)

bulk_chain/core/llm_base.py CHANGED Viewed

@@ -1,13 +1,52 @@
+import logging
+import time
 from bulk_chain.core.utils import format_model_name
 class BaseLM(object):
+    """ Base class of a language model wrapper; subclasses implement `ask`. """
-    def __init__(self, name):
+    def __init__(self, name=None, attempts=None, delay_sec=1, enable_log=True,
+                 support_batching=False, **kwargs):
+        """ name: model name that `name()` passes to `format_model_name`.
+            attempts: amount of tries per `ask_core` call (None stands for a single try).
+            delay_sec: pause in seconds between a failed try and the next one.
+            enable_log: report failed tries via the `logging` module.
+            support_batching: whether `ask` receives the whole batch at once.
+            kwargs: accepted for compatibility and not used here.
+        """
         self.__name = name
+        self.__attempts = 1 if attempts is None else attempts
+        self.__delay_sec = delay_sec
+        self.__support_batching = support_batching
+        # The attribute has to exist even with disabled logging: `ask_core` reads it on failure.
+        self.__logger = None
+        if enable_log:
+            self.__logger = logging.getLogger(__name__)
+            logging.basicConfig(level=logging.INFO)
+    def ask_core(self, batch):
+        """ Infers responses for `batch` (list of inputs), retrying on any exception.
+            With batching support, `ask` receives the whole batch and its result is returned as is.
+            Otherwise the batch must contain exactly one item and the single response
+            is returned wrapped into a list.
+            Raises Exception once all the tries have failed.
+        """
+        for i in range(self.__attempts):
+            try:
+                if self.__support_batching:
+                    # Launch in batch mode: the model is asked once, with the batch as it is.
+                    content = batch
+                else:
+                    # Launch in non-batch mode.
+                    assert len(batch) == 1, "The LM does not support batching," \
+                                            f" while size of the content is {len(batch)} which is not equal 1. " \
+                                            f"Please enable batch-supporting or set required inference settings."
+                    content = batch[0]
+                response = self.ask(content)
+                # Wrapping into batch the response in the case of non-batching mode.
+                return response if self.__support_batching else [response]
+            except Exception as e:
+                if self.__logger is not None:
+                    # Tries are reported 1-based, so the last one reads "N out of N".
+                    self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i + 1, self.__attempts))
+                    self.__logger.info(e)
+                # There is nothing to wait for once the last try has failed.
+                if i + 1 < self.__attempts:
+                    time.sleep(self.__delay_sec)
+        raise Exception("Can't infer")
-    def ask(self, prompt):
+    def ask(self, content):
+        """ Infers the model for `content`: a single input, or the whole batch
+            when batching is supported. To be implemented by subclasses.
+        """
-        raise NotImplemented()
+        raise NotImplementedError()
     def name(self):
-        return format_model_name(self.__name)
+        return format_model_name(self.__name)

bulk_chain/core/service_args.py CHANGED Viewed

@@ -33,14 +33,33 @@ class CmdArgsService:
             yield __release()
     @staticmethod
-    def partition_list(lst, sep):
+    def __find_suffix_ind(lst, idx_from, end_prefix):
+        """ Returns the index of the first item of `lst`, at or after `idx_from`, that
+            starts with `end_prefix`; returns `len(lst)` if there is no such item.
+        """
+        positions = (i for i in range(idx_from, len(lst)) if lst[i].startswith(end_prefix))
+        return next(positions, len(lst))
+    @staticmethod
+    def extract_native_args(lst, end_prefix):
+        """ Returns the leading items of `lst` that precede the first item starting
+            with `end_prefix` (a copy of the whole list if there is no such item).
+        """
+        idx_to = CmdArgsService.__find_suffix_ind(lst, idx_from=0, end_prefix=end_prefix)
+        return lst[:idx_to]
+    @staticmethod
+    def find_grouped_args(lst, starts_with, end_prefix):
-        """Slices a list in two, cutting on index matching "sep"
-        """
+        """ Returns the items of `lst` that follow the first occurrence of `starts_with`,
+            up to (excluding) the next item that starts with `end_prefix`, or up to the
+            end of `lst` if there is no such item.
+            Returns an empty list if `starts_with` is not in `lst`.
+        """
-        if sep in lst:
-            idx = lst.index(sep)
-            return (lst[:idx], lst[idx+1:])
-        else:
-            return (lst[:], None)
+        # Checking the presence of starts_with.
+        # We have to return empty content if starts_with is absent from lst.
+        if starts_with not in lst:
+            return []
+        # Assigning start index.
+        idx_from = lst.index(starts_with) + 1
+        # Assigning end index.
+        idx_to = CmdArgsService.__find_suffix_ind(lst, idx_from=idx_from, end_prefix=end_prefix)
+        return lst[idx_from:idx_to]
     @staticmethod
     def args_to_dict(args):

bulk_chain/core/service_batch.py ADDED Viewed

@@ -0,0 +1,51 @@
+class BatchService(object):
+    """ Helpers for handling a parameter of all the batch items at once. """
+    @staticmethod
+    def handle_param_as_batch(batch, src_param, tgt_param, handle_func):
+        """ Collects `src_param` of every item in `batch` into a list, passes that list
+            to `handle_func`, and stores the i-th result as `tgt_param` of the i-th item.
+            The items are modified in place; `handle_func` has to return a list.
+        """
+        assert (isinstance(batch, list))
+        assert (isinstance(src_param, str))
+        assert (callable(handle_func))
+        sources = []
+        for entry in batch:
+            sources.append(entry[src_param])
+        # Do handling for the batch.
+        results = handle_func(sources)
+        assert (isinstance(results, list))
+        # Apply changes, position by position.
+        for position in range(len(batch)):
+            batch[position][tgt_param] = results[position]
+class BatchIterator:
+    """ Groups the items of `data_iter` into lists of at most `batch_size` items. """
+    def __init__(self, data_iter, batch_size, end_value=None):
+        """ data_iter: iterator or any other iterable (list, generator, etc.) of items.
+            batch_size: positive amount of items per batch; the last batch might be shorter.
+            end_value: optional callable; once the data is exhausted its result is returned
+                by every further `__next__` call instead of raising StopIteration,
+                so the iteration then never stops on its own.
+        """
+        assert(isinstance(batch_size, int) and batch_size > 0)
+        assert(callable(end_value) or end_value is None)
+        # `iter()` makes plain iterables acceptable and leaves an iterator unchanged.
+        self.__data_iter = iter(data_iter)
+        self.__index = 0
+        self.__batch_size = batch_size
+        self.__end_value = end_value
+    def __iter__(self):
+        return self
+    def __next__(self):
+        buffer = []
+        # The loop resumes from where the previous batch has ended, since it goes over an iterator.
+        for data in self.__data_iter:
+            buffer.append(data)
+            if len(buffer) == self.__batch_size:
+                break
+        if len(buffer) > 0:
+            self.__index += 1
+            return buffer
+        if self.__end_value is None:
+            raise StopIteration
+        else:
+            return self.__end_value()

bulk_chain/core/service_data.py CHANGED Viewed

@@ -20,3 +20,7 @@ class DataService(object):
         field_names = list(parse_fields_func(prompt))
         return DataService.compose_prompt_text(
             prompt=prompt, data_dict=data_dict, field_names=field_names)
+    @staticmethod
+    def limit_prompts(prompts_list, limit=None):
+        """ Returns a new list with every prompt cut to `p[:limit]`;
+            the prompts are kept as they are when `limit` is None.
+        """
+        if limit is None:
+            return list(prompts_list)
+        return [prompt[:limit] for prompt in prompts_list]

bulk_chain/core/service_dict.py ADDED Viewed

@@ -0,0 +1,10 @@
+class DictionaryService:
+    """ Helpers for handling plain dictionaries. """
+    @staticmethod
+    def custom_update(src_dict, other_dict):
+        """ Copies the entries of `other_dict` into `src_dict` in place and returns `src_dict`.
+            Raises Exception on the first key that `src_dict` already has; the entries
+            copied before that key remain in `src_dict`.
+        """
+        for key, value in other_dict.items():
+            if key not in src_dict:
+                src_dict[key] = value
+                continue
+            raise Exception(f"The key `{key}` is already defined in both dicts with values: "
+                            f"`{src_dict[key]}` (src) and `{value}` (other)")
+        return src_dict

bulk_chain/core/service_json.py CHANGED Viewed

@@ -4,23 +4,7 @@ import json
 class JsonService(object):
     @staticmethod
-    def read_data(src):
+    def read(src):
+        """ Parses the file at the path `src` as JSON and returns the parsed content. """
         assert (isinstance(src, str))
         with open(src, "r") as f:
-            return json.load(f)
-    @staticmethod
-    def read_lines(src, row_id_key=None):
-        assert (isinstance(src, str))
-        with open(src, "r") as f:
-            for line_ind, line in enumerate(f.readlines()):
-                content = json.loads(line)
-                if row_id_key is not None:
-                    content[row_id_key] = line_ind
-                yield content
-    @staticmethod
-    def write_lines(target, data_it):
-        with open(target, "w") as f:
-            for item in data_it:
-                f.write(f"{json.dumps(item, ensure_ascii=False)}\n")
+            return json.load(f)

bulk_chain/core/service_llm.py CHANGED Viewed

@@ -4,9 +4,6 @@ from bulk_chain.core.llm_base import BaseLM
 from bulk_chain.core.service_data import DataService
 from bulk_chain.core.utils import iter_params
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
 def pad_str(text, pad):
+    """ Returns `text` preceded by `pad` space characters. """
     return text.rjust(len(text) + pad, ' ')
@@ -27,9 +24,12 @@ def nice_output(text, width, pad=4, remove_new_line=False):
 def chat_with_lm(lm, chain=None, model_name=None):
-    assert(isinstance(lm, BaseLM))
-    assert(isinstance(chain, list))
-    assert(isinstance(model_name, str) or model_name is None)
+    assert (isinstance(lm, BaseLM))
+    assert (isinstance(chain, list))
+    assert (isinstance(model_name, str) or model_name is None)
+    logger = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)
     do_exit = False
     model_name = model_name if model_name is not None else "agent"
@@ -74,9 +74,9 @@ def chat_with_lm(lm, chain=None, model_name=None):
             logger.info(nice_output(actual_prompt, pad=pad*2, remove_new_line=True, width=80))
             # Response.
-            response = lm.ask(actual_prompt)
+            response_batch = lm.ask_core(batch=[actual_prompt])
             logger.info(pad_str(f"{model_name} (resp)->", pad=pad))
-            logger.info(nice_output(response, pad=pad*2, remove_new_line=False, width=80))
+            logger.info(nice_output(response_batch[0], pad=pad * 2, remove_new_line=False, width=80))
             # Collecting the answer for the next turn.
-            data_dict[prompt_args["out"]] = response
+            data_dict[prompt_args["out"]] = response_batch[0]

bulk_chain/core/service_schema.py CHANGED Viewed

@@ -2,12 +2,11 @@ class SchemaService(object):
     def __init__(self, json_data):
+        """ json_data: dictionary whose "schema" entry holds the prompts of the schema.
+            The dictionary itself is kept as `self.src`.
+        """
         self.src = json_data
-        self.name = self.src["name"]
         self.r2p, self.p2r, self.cot_args, self.chain = SchemaService.__init_schema(prompts=json_data["schema"])
     @classmethod
     def from_prompt(cls, prompt):
+        """ Creates a schema of a single step that has `prompt` as its "prompt" entry,
+            "prompt" as its "in" entry and "response" as its "out" entry.
+        """
-        prompt_schema = {"name": "prompt", "schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
+        prompt_schema = {"schema": [{"prompt": prompt, "out": "response", "in": "prompt"}]}
         return cls(prompt_schema)
     @staticmethod

bulk_chain/infer.py CHANGED Viewed

@@ -1,17 +1,18 @@
+from os.path import join, basename
 import argparse
 import logging
-import os
 import sys
+from source_iter.service_csv import CsvService
+from source_iter.service_jsonl import JsonlService
+from source_iter.service_sqlite import SQLite3Service
 from tqdm import tqdm
-from os.path import join, basename
+from bulk_chain.api import INFER_MODES, _infer_batch, CWD
 from bulk_chain.core.llm_base import BaseLM
-from bulk_chain.core.provider_sqlite import SQLiteProvider
 from bulk_chain.core.service_args import CmdArgsService
-from bulk_chain.core.service_csv import CsvService
-from bulk_chain.core.service_data import DataService
+from bulk_chain.core.service_dict import DictionaryService
 from bulk_chain.core.service_json import JsonService
 from bulk_chain.core.service_llm import chat_with_lm
 from bulk_chain.core.service_schema import SchemaService
@@ -21,7 +22,16 @@ logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
-CWD = os.getcwd()
+# Cache writers by storage type. The "sqlite" writer forwards everything to
+# `SQLite3Service.write`, passing `infer_data_func` as `data2col_func` and `skip_existed=True`.
+# NOTE(review): `skip_existed` presumably skips the rows that are already stored -- confirm
+# against the `source-iter` package.
+WRITER_PROVIDERS = {
+    "sqlite": lambda filepath, table_name, data_it, infer_data_func, **kwargs: SQLite3Service.write(
+        data_it=data_it, target=filepath, table_name=table_name, data2col_func=infer_data_func,
+        skip_existed=True, **kwargs)
+}
+# Cache readers by storage type. The "sqlite" reader forwards to `SQLite3Service.read`.
+READER_PROVIDERS = {
+    "sqlite": lambda filepath, table_name: SQLite3Service.read(filepath, table=table_name)
+}
 def init_llm(**model_kwargs):
@@ -44,59 +54,44 @@ def init_llm(**model_kwargs):
     return llm, llm_model_name
-def init_schema(json_filepath):
-    return SchemaService(json_data=JsonService.read_data(json_filepath))
-def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table, id_column_name):
-    """ This method represent Python API aimed at application of `llm` towards
-        iterator of input_dicts via cache_target that refers to the SQLite using
-        the given `schema`
-    """
+def iter_content_cached(input_dicts_it, llm, schema, cache_target, limit_prompt=None, **cache_kwargs):
+    """ Applies `llm` to the input dicts according to `schema`: the results are written
+        by the "sqlite" cache writer first and then returned as read back by the "sqlite" reader.
+        schema: SchemaService instance, a dict to build it from, or a path to a JSON file with such a dict.
+        cache_target: string that `parse_filepath` turns into the cache file path and the table name
+            (the caller in this file passes "<filepath>:<table>").
+        limit_prompt: optional limit forwarded to the "default" entry of INFER_MODES.
+        cache_kwargs: extra keyword arguments forwarded to the cache writer.
+    """
     assert (isinstance(llm, BaseLM))
-    assert (isinstance(schema, SchemaService))
     assert (isinstance(cache_target, str))
-    assert (isinstance(cache_table, str))
-    infer_modes = {
-        "default": lambda prompt: llm.ask(prompt[:args.limit_prompt] if args.limit_prompt is not None else prompt)
-    }
+    # Quick initialization of the schema.
+    if isinstance(schema, str):
+        schema = JsonService.read(schema)
+    if isinstance(schema, dict):
+        schema = SchemaService(json_data=schema)
-    def optional_update_data_records(c, data):
-        assert (isinstance(c, str))
+    # Iterator of the queries.
+    prompts_it = map(
+        lambda data: DictionaryService.custom_update(src_dict=data, other_dict=schema.cot_args),
+        input_dicts_it
+    )
-        if c in schema.p2r:
-            data[c] = DataService.get_prompt_text(prompt=data[c]["prompt"], data_dict=data)
-        if c in schema.r2p:
-            p_column = schema.r2p[c]
-            # This instruction takes a lot of time in a non-batching mode.
-            data[c] = infer_modes["default"](data[p_column])
+    # Parse target.
+    cache_filepath, _, cache_table = parse_filepath(filepath=cache_target)
-        return data[c]
+    # Perform caching first.
+    WRITER_PROVIDERS["sqlite"](
+        filepath=cache_filepath, table_name=cache_table,
+        data_it=tqdm(prompts_it, desc="Iter content"),
+        infer_data_func=lambda c, prompt: _infer_batch(
+            batch=[prompt], cols=[c],
+            infer_func=lambda batch: INFER_MODES["default"](llm, batch, limit_prompt),
+            schema=schema)[0][c],
+        **cache_kwargs)
-    cache_providers = {
-        "sqlite": lambda filepath, table_name, data_it: SQLiteProvider.write_auto(
-            data_it=data_it, target=filepath,
-            data2col_func=optional_update_data_records,
-            table_name=handle_table_name(table_name if table_name is not None else "contents"),
-            id_column_name=id_column_name)
-    }
-    # We optionally wrap into limiter.
-    queries_it = optional_limit_iter(
-        it_data=map(lambda data: data.update(schema.cot_args) or data, input_dicts_iter),
-        limit=args.limit)
-    # Provide data caching.
-    cache_providers["sqlite"](cache_target, table_name=tgt_meta, data_it=tqdm(queries_it, desc="Iter content"))
-    return SQLiteProvider.iter_rows(cache_target, table=cache_table)
+    # Then retrieve data.
+    return READER_PROVIDERS["sqlite"](filepath=cache_filepath, table_name=cache_table)
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Infer Instruct LLM inference based on CoT schema")
     parser.add_argument('--adapter', dest='adapter', type=str, default=None)
+    parser.add_argument('--attempts', dest='attempts', type=int, default=None)
     parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
     parser.add_argument('--src', dest='src', type=str, default=None)
     parser.add_argument('--schema', dest='schema', type=str, default=None,
@@ -108,34 +103,52 @@ if __name__ == '__main__':
     parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
                         help="Optional trimming prompt by the specified amount of characters.")
-    native_args, model_args = CmdArgsService.partition_list(lst=sys.argv, sep="%%")
+    # Extract native arguments.
+    native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
     args = parser.parse_args(args=native_args[1:])
-    # Initialize Large Language Model.
-    model_args_dict = CmdArgsService.args_to_dict(model_args)
+    # Extract csv-related arguments.
+    csv_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%csv", end_prefix="%%")
+    csv_args_dict = CmdArgsService.args_to_dict(csv_args)
+    # Extract model-related arguments and Initialize Large Language Model.
+    model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
+    model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
     llm, llm_model_name = init_llm(**model_args_dict)
     # Setup schema.
-    schema = init_schema(args.schema)
+    schema = SchemaService(json_data=JsonService.read(args.schema))
+    schema_name = schema.src.get("name", None)
     if schema is not None:
-        logger.info(f"Using schema: {schema.name}")
+        logger.info(f"Using schema: {schema_name}")
     input_providers = {
         None: lambda _: chat_with_lm(llm, chain=schema.chain, model_name=llm_model_name),
-        "csv": lambda filepath: CsvService.read(target=filepath, row_id_key=args.id_col,
+        "csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
+                                                as_dict=True, skip_header=True,
+                                                delimiter=csv_args_dict.get("delimiter", ","),
+                                                escapechar=csv_args_dict.get("escapechar", None)),
+        "tsv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
                                                 as_dict=True, skip_header=True,
-                                                delimiter=model_args_dict.get("delimiter", "\t"),
-                                                escapechar=model_args_dict.get("escapechar", None)),
-        "jsonl": lambda filepath: JsonService.read_lines(src=filepath, row_id_key=args.id_col)
+                                                delimiter=csv_args_dict.get("delimiter", "\t"),
+                                                escapechar=csv_args_dict.get("escapechar", None)),
+        "jsonl": lambda filepath: JsonlService.read(src=filepath, row_id_key=args.id_col)
     }
     output_providers = {
-        "csv": lambda filepath, data_it, header:
-        CsvService.write_handled(target=filepath, data_it=data_it, header=header, data2col_func=lambda v: list(v)),
+        "csv": lambda filepath, data_it, header: CsvService.write(target=filepath,
+                                                                  data_it=data_it, header=header,
+                                                                  delimiter=csv_args_dict.get("delimiter", ","),
+                                                                  escapechar=csv_args_dict.get("escapechar", None),
+                                                                  it_type=None),
+        "tsv": lambda filepath, data_it, header: CsvService.write(target=filepath,
+                                                                  data_it=data_it, header=header,
+                                                                  delimiter=csv_args_dict.get("delimiter", "\t"),
+                                                                  escapechar=csv_args_dict.get("escapechar", None),
+                                                                  it_type=None),
         "jsonl": lambda filepath, data_it, header:
-        JsonService.write_lines(target=filepath,
-                                data_it=map(lambda item: {key:item[i] for i, key in enumerate(header)}, data_it))
+        JsonlService.write(target=filepath,
+                           data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
     }
     # Setup output.
@@ -150,24 +163,29 @@ if __name__ == '__main__':
         input_providers[src_ext](None)
         exit(0)
+    def default_output_file_template(ext):
+        # This is a default template for output files to be generated.
+        return "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema_name]), ext])
     # Setup cache target as well as the related table.
-    cache_target = "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema.name]), f".sqlite"]) \
-        if tgt_filepath is None else tgt_filepath
+    cache_filepath = default_output_file_template(".sqlite") if tgt_filepath is None else tgt_filepath
     cache_table = handle_table_name(tgt_meta if tgt_meta is not None else "contents")
-    data_it = iter_content(input_dicts_iter=input_providers[src_ext](src_filepath),
-                           schema=schema,
-                           llm=llm,
-                           id_column_name=args.id_col,
-                           cache_target=cache_target,
-                           cache_table=cache_table)
+    # This is a content that we extracted via input provider.
+    it_data = input_providers[src_ext](src_filepath)
+    data_it = iter_content_cached(input_dicts_it=optional_limit_iter(it_data=it_data, limit=args.limit),
+                                  limit_prompt=args.limit_prompt,
+                                  schema=schema,
+                                  llm=llm,
+                                  id_column_name=args.id_col,
+                                  cache_target=":".join([cache_filepath, cache_table]))
     # Setup output target
     tgt_ext = src_ext if tgt_ext is None else tgt_ext
-    output_target = "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema.name]), f".{tgt_ext}"]) \
-        if tgt_filepath is None else tgt_filepath
+    output_target = default_output_file_template(f".{tgt_ext}") if tgt_filepath is None else tgt_filepath
     # Perform output writing process.
     output_providers[tgt_ext](filepath=output_target,
                               data_it=data_it,
-                              header=SQLiteProvider.get_columns(target=cache_target, table=cache_table))
+                              header=SQLite3Service.read_columns(target=cache_filepath, table=cache_table))

{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bulk_chain
-Version: 0.24.1
+Version: 0.25.0
 Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
 Home-page: https://github.com/nicolay-r/bulk-chain
 Author: Nicolay Rusnachenko
@@ -15,32 +15,42 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: tqdm
-# bulk-chain 0.24.1
+# bulk-chain 0.25.0
 ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
 [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
 [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
+[![PyPI downloads](https://img.shields.io/pypi/dm/bulk-chain.svg)](https://pypistats.org/packages/bulk-chain)
 <p align="center">
     <img src="logo.png"/>
 </p>
-A lightweight, no-strings-attached **[Chain-of-Thought](https://arxiv.org/abs/2201.11903) framework** for your LLM, ensuring reliable results for bulk input requests stored in `CSV` / `JSONL` / `sqlite`.
-It allows applying series of prompts formed into `schema` (See [related section](#chain-of-thought-schema))
+A lightweight, no-strings-attached **framework** for your LLM that allows applying a [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (see the [related section](#chain-of-thought-schema)) to massive textual collections.
-### Features
+### Main Features
 * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
-* ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
-* ✅ **Progress caching**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
 * ✅ **Support schemas descriptions** for Chain-of-Thought concept.
+* ✅ **Provides iterator over infinite amount of input contexts** served in `CSV`/`JSONL`.
+### Extra Features
+* ✅ **Progress caching [for remote LLMs]**: withstanding exception during LLM calls by using `sqlite3` engine for caching LLM answers;
 # Installation
+From PyPI:
 ```bash
 pip install bulk-chain
 ```
+or the latest version from GitHub:
+```bash
+pip install git+https://github.com/nicolay-r/bulk-chain@master
+```
 ## Chain-of-Thought Schema
 To declare Chain-of-Though (CoT) schema, this project exploits `JSON` format.
@@ -63,35 +73,37 @@ Below, is an example on how to declare your own schema:
 }
 ```
-Another templates are available [here](/ext/schema/thor_cot_schema.json).
+Other templates are available [here](/ext/schema/).
 # Usage
-Just **three** simple steps:
+Preliminary steps:
-1. Define your [CoT Schema](#chain-of-thought-schema), or fetch it as shown below:
-```bash
-!wget https://raw.githubusercontent.com/nicolay-r/bulk-chain/refs/heads/master/ext/schema/default.json
-```
-2. Fetch or write your own **model** or pick the one [preset here](/ext/):
-```bash
-!wget https://raw.githubusercontent.com/nicolay-r/bulk-chain/refs/heads/master/ext/flan_t5.py
-```
+1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
+2. Wrap or pick **LLM model** from the [list of presets](/ext/).
+## API
+Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/bulk-chain/wiki)
+## Shell
+> **NOTE:** You have to install the `source-iter` package
-3. Launch inference in (chat mode):
 ```bash
-!python -m bulk_chain.infer \
-    --schema "default.json" \
-    --adapter "dynamic:flan_t5.py:FlanT5" \
-    %% \
-    --device "cpu" \
+python3 -m bulk_chain.infer \
+    --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
+    --schema "ext/schema/default.json" \
+    --adapter "dynamic:ext/replicate.py:Replicate" \
+    %%m \
+    --api_token "<REPLICATE-API-TOKEN>" \
     --temp 0.1
 ```
 # Embed your LLM
 All you have to do is to implement `BaseLM` class, that includes:
-* `__init__` -- for initialization;
+* `__init__` -- for setting up *batching mode support* and (optional) *model name*;
 * `ask(prompt)` -- infer your model with the given `prompt`.
 See examples with models [here](/ext).

bulk_chain-0.25.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+bulk_chain/api.py,sha256=08i2tgFa_CCA0obC_Yr3rURI6MkuXYKgmuZaLcs4NLk,2807
+bulk_chain/infer.py,sha256=oWtBf2itZeM3fD-_QAzABKUMbsl4BqvHmW21TUTr880,9110
+bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+bulk_chain/core/llm_base.py,sha256=uX_uibm5y8STfMKNYL64EeF8UowfJGwCD_t-uftHoJE,1849
+bulk_chain/core/service_args.py,sha256=x-QHaKLD1d6qaJkD4lNwx7640ku9-6Uyr3mooB_6kLc,1981
+bulk_chain/core/service_batch.py,sha256=yQr6fbQd4ifQBGMhZMrQQeZpXtDchMKMGJi8XPG7thc,1430
+bulk_chain/core/service_data.py,sha256=ZjJDtd1jrQm9hRCXMqe4CT_qF2XDbWBE1lVibP7tAWo,942
+bulk_chain/core/service_dict.py,sha256=lAghLU-3V3xYGv5BTA327Qcw8UJYmgQRMFdggzlrUgo,383
+bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
+bulk_chain/core/service_llm.py,sha256=1xbFW5OQY2ckKwIDZjsgNtnxKDp2wDjKKwyNS_yMU2s,2776
+bulk_chain/core/service_schema.py,sha256=KIP4n0Tz2h1i7SIMGhgAhoiCgUFXOT1rzMt38yACS2U,1154
+bulk_chain/core/utils.py,sha256=UV6Cefaw7yZiYblsCr-s9LsbcI83xe7eESBvha9A2Og,2784
+bulk_chain-0.25.0.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
+bulk_chain-0.25.0.dist-info/METADATA,sha256=-Ky6ZekXHUCBByhSTgDYgMpC64ew8lGmQ7-I9dKsv6U,3874
+bulk_chain-0.25.0.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
+bulk_chain-0.25.0.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
+bulk_chain-0.25.0.dist-info/RECORD,,

bulk_chain/core/provider_sqlite.py DELETED Viewed

@@ -1,78 +0,0 @@
-import sqlite3
-class SQLiteProvider(object):
-    @staticmethod
-    def __create_table(table_name, columns, id_column_name,
-                       id_column_type, sqlite3_column_types, cur):
-        # Provide the ID column.
-        sqlite3_column_types = [id_column_type] + sqlite3_column_types
-        # Compose the whole columns list.
-        content = ", ".join([f"[{item[0]}] {item[1]}" for item in zip(columns, sqlite3_column_types)])
-        cur.execute(f"CREATE TABLE IF NOT EXISTS {table_name}({content})")
-        cur.execute(f"CREATE INDEX IF NOT EXISTS [{id_column_name}] ON {table_name}([{id_column_name}])")
-    @staticmethod
-    def write_auto(data_it, target, data2col_func, table_name, id_column_name="id",
-                   id_column_type="INTEGER"):
-        """ NOTE: data_it is an iterator of dictionaries.
-            This implementation automatically creates the table and its ID index from the keys of the first row.
-        """
-        with sqlite3.connect(target) as con:
-            cur = con.cursor()
-            columns = None
-            for data in data_it:
-                assert(isinstance(data, dict))
-                # Extracting columns from data.
-                row_columns = list(data.keys())
-                assert(id_column_name in row_columns)
-                # Optionally create table.
-                if columns is None:
-                    # Setup list of columns.
-                    columns = row_columns
-                    # Place ID column first.
-                    columns.insert(0, columns.pop(columns.index(id_column_name)))
-                    SQLiteProvider.__create_table(
-                        columns=columns, table_name=table_name, cur=cur,
-                        id_column_name=id_column_name, id_column_type=id_column_type,
-                        sqlite3_column_types=["TEXT"] * len(columns))
-                # Check that each row satisfies the criteria of the first row.
-                [Exception(f"{column} is expected to be in row!") for column in row_columns if column not in columns]
-                uid = data[id_column_name]
-                r = cur.execute(f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE [{id_column_name}]='{uid}');")
-                ans = r.fetchone()[0]
-                if ans == 1:
-                    continue
-                params = ", ".join(tuple(['?'] * (len(columns))))
-                row_columns_str = ", ".join([f"[{col}]" for col in row_columns])
-                cur.execute(f"INSERT INTO {table_name}({row_columns_str}) VALUES ({params})",
-                            [data2col_func(c, data) for c in row_columns])
-                con.commit()
-            cur.close()
-    @staticmethod
-    def iter_rows(target, table="content"):
-        with sqlite3.connect(target) as conn:
-            cursor = conn.cursor()
-            cursor.execute(f"SELECT * FROM {table}")
-            for row in cursor:
-                yield row
-    @staticmethod
-    def get_columns(target, table="content"):
-        with sqlite3.connect(target) as conn:
-            cursor = conn.cursor()
-            cursor.execute(f"PRAGMA table_info({table})")
-            return [row[1] for row in cursor.fetchall()]

bulk_chain/core/service_csv.py DELETED Viewed

@@ -1,57 +0,0 @@
-import csv
-import logging
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-class CsvService:
-    @staticmethod
-    def write(target, lines_it):
-        f = open(target, "w")
-        logger.info(f"Saving: {target}")
-        w = csv.writer(f, delimiter="\t", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-        for content in lines_it:
-            w.writerow(content)
-    @staticmethod
-    def write_handled(target, data_it, data2col_func, header):
-        def __it():
-            yield header
-            for data in data_it:
-                content = data2col_func(data)
-                assert(len(content) == len(header))
-                yield content
-        CsvService.write(target, lines_it=__it())
-    @staticmethod
-    def read(target, skip_header=False, cols=None, as_dict=False, row_id_key=None, **csv_kwargs):
-        assert (isinstance(row_id_key, str) or row_id_key is None)
-        assert (isinstance(cols, list) or cols is None)
-        header = None
-        with open(target, newline='\n') as f:
-            for row_id, row in enumerate(csv.reader(f, **csv_kwargs)):
-                if skip_header and row_id == 0:
-                    header = ([row_id_key] if row_id_key is not None else []) + row
-                    continue
-                # Determine the content we wish to return.
-                if cols is None:
-                    content = row
-                else:
-                    row_d = {header[col_ind]: value for col_ind, value in enumerate(row)}
-                    content = [row_d[col_name] for col_name in cols]
-                content = ([row_id-1] if row_id_key is not None else []) + content
-                # Optionally attach row_id to the content.
-                if as_dict:
-                    assert (header is not None)
-                    assert (len(content) == len(header))
-                    yield {k: v for k, v in zip(header, content)}
-                else:
-                    yield content

bulk_chain-0.24.1.dist-info/RECORD DELETED Viewed

@@ -1,17 +0,0 @@
-bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bulk_chain/infer.py,sha256=hD9GJEp6P9PZRBSUCIxK8DaDjsX-oiq8VCe0rAD2EPs,7366
-bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bulk_chain/core/llm_base.py,sha256=5js2RJLpNS5t-De-xTpZCbLMgbz3F_b9tU_CtXhy02I,259
-bulk_chain/core/provider_sqlite.py,sha256=rNUvBt3aGa6Uv4a9RItyMgBZPnFbBdNjnt0Gw81lM3I,3171
-bulk_chain/core/service_args.py,sha256=Qr3rHsAB8wnajB-DbU-GjiEpRZFP4D6s1lVTpLkPPX4,1294
-bulk_chain/core/service_csv.py,sha256=-m8tNN9aIqRfJa4sPUX8ZUDP4W0fgnnOR3_0PapepDY,1984
-bulk_chain/core/service_data.py,sha256=18gQwSCTEsI7XFukq8AE5lDJX_QQRpasaH69g6EddV0,797
-bulk_chain/core/service_json.py,sha256=alYqTQbBjAcCh7anSTOZs1CLJbiWrLPpzLcoADstD0Q,743
-bulk_chain/core/service_llm.py,sha256=tYgMphJkXunhxdrThdfI4eM8qQTCZfEM1kabbReVjuQ,2726
-bulk_chain/core/service_schema.py,sha256=JVhOv2YP5VEtiwOq_zgCzhS2uF_BOATAgg6fmKRf2NQ,1209
-bulk_chain/core/utils.py,sha256=UV6Cefaw7yZiYblsCr-s9LsbcI83xe7eESBvha9A2Og,2784
-bulk_chain-0.24.1.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
-bulk_chain-0.24.1.dist-info/METADATA,sha256=g5_Sr1pfa8v5lRs0sd7Ldch-uLiV_KfdDXaTHSen-R4,3649
-bulk_chain-0.24.1.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
-bulk_chain-0.24.1.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
-bulk_chain-0.24.1.dist-info/RECORD,,

{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{bulk_chain-0.24.1.dist-info → bulk_chain-0.25.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

bulk-chain 0.24.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

bulk-chain 0.24.1py3-none-any.whl → 0.25.0py3-none-any.whl