PyPI - bulk-chain - Versions diffs - 0.24.0__py3-none-any.whl → 0.24.2__py3-none-any.whl - Mend

bulk-chain 0.24.0__py3-none-any.whl → 0.24.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

bulk_chain/core/llm_base.py +24 -2
bulk_chain/core/service_json.py +2 -18
bulk_chain/core/service_llm.py +7 -7
bulk_chain/infer.py +26 -20
{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/METADATA +9 -3
bulk_chain-0.24.2.dist-info/RECORD +15 -0
bulk_chain/core/provider_sqlite.py +0 -79
bulk_chain/core/service_csv.py +0 -57
bulk_chain-0.24.0.dist-info/RECORD +0 -17
{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/LICENSE +0 -0
{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/WHEEL +0 -0
{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/top_level.txt +0 -0

bulk_chain/core/llm_base.py CHANGED Viewed

@@ -1,13 +1,35 @@
+import logging
+import time
 from bulk_chain.core.utils import format_model_name
 class BaseLM(object):
-    def __init__(self, name):
+    def __init__(self, name, attempts=None, delay_sec=1, enable_log=True, **kwargs):
         self.__name = name
+        self.__attempts = 1 if attempts is None else attempts
+        self.__delay_sec = delay_sec
+        if enable_log:
+            self.__logger = logging.getLogger(__name__)
+            logging.basicConfig(level=logging.INFO)
+    def ask_safe(self, prompt):
+        for i in range(self.__attempts):
+            try:
+                response = self.ask(prompt)
+                return response
+            except:
+                if self.__logger is not None:
+                    self.__logger.info("Unable to infer the result. Try {} out of {}.".format(i, self.__attempts))
+                time.sleep(self.__delay_sec)
+        raise Exception("Can't infer")
     def ask(self, prompt):
         raise NotImplemented()
     def name(self):
-        return format_model_name(self.__name)
+        return format_model_name(self.__name)

bulk_chain/core/service_json.py CHANGED Viewed

@@ -4,23 +4,7 @@ import json
 class JsonService(object):
     @staticmethod
-    def read_data(src):
+    def read(src):
         assert (isinstance(src, str))
         with open(src, "r") as f:
-            return json.load(f)
-    @staticmethod
-    def read_lines(src, row_id_key=None):
-        assert (isinstance(src, str))
-        with open(src, "r") as f:
-            for line_ind, line in enumerate(f.readlines()):
-                content = json.loads(line)
-                if row_id_key is not None:
-                    content[row_id_key] = line_ind
-                yield content
-    @staticmethod
-    def write_lines(target, data_it):
-        with open(target, "w") as f:
-            for item in data_it:
-                f.write(f"{json.dumps(item, ensure_ascii=False)}\n")
+            return json.load(f)

bulk_chain/core/service_llm.py CHANGED Viewed

@@ -4,9 +4,6 @@ from bulk_chain.core.llm_base import BaseLM
 from bulk_chain.core.service_data import DataService
 from bulk_chain.core.utils import iter_params
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
 def pad_str(text, pad):
     return text.rjust(len(text) + pad, ' ')
@@ -27,9 +24,12 @@ def nice_output(text, width, pad=4, remove_new_line=False):
 def chat_with_lm(lm, chain=None, model_name=None):
-    assert(isinstance(lm, BaseLM))
-    assert(isinstance(chain, list))
-    assert(isinstance(model_name, str) or model_name is None)
+    assert (isinstance(lm, BaseLM))
+    assert (isinstance(chain, list))
+    assert (isinstance(model_name, str) or model_name is None)
+    logger = logging.getLogger(__name__)
+    logging.basicConfig(level=logging.INFO)
     do_exit = False
     model_name = model_name if model_name is not None else "agent"
@@ -74,7 +74,7 @@ def chat_with_lm(lm, chain=None, model_name=None):
             logger.info(nice_output(actual_prompt, pad=pad*2, remove_new_line=True, width=80))
             # Response.
-            response = lm.ask(actual_prompt)
+            response = lm.ask_safe(actual_prompt)
             logger.info(pad_str(f"{model_name} (resp)->", pad=pad))
             logger.info(nice_output(response, pad=pad*2, remove_new_line=False, width=80))

bulk_chain/infer.py CHANGED Viewed

@@ -1,16 +1,18 @@
+import os
+from os.path import join, basename
 import argparse
 import logging
-import os
 import sys
 from tqdm import tqdm
-from os.path import join, basename
+from source_iter.service_csv import CsvService
+from source_iter.service_jsonl import JsonlService
+from source_iter.service_sqlite import SQLite3Service
 from bulk_chain.core.llm_base import BaseLM
-from bulk_chain.core.provider_sqlite import SQLiteProvider
 from bulk_chain.core.service_args import CmdArgsService
-from bulk_chain.core.service_csv import CsvService
 from bulk_chain.core.service_data import DataService
 from bulk_chain.core.service_json import JsonService
 from bulk_chain.core.service_llm import chat_with_lm
@@ -45,10 +47,10 @@ def init_llm(**model_kwargs):
 def init_schema(json_filepath):
-    return SchemaService(json_data=JsonService.read_data(json_filepath))
+    return SchemaService(json_data=JsonService.read(json_filepath))
-def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table):
+def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table, id_column_name):
     """ This method represent Python API aimed at application of `llm` towards
         iterator of input_dicts via cache_target that refers to the SQLite using
         the given `schema`
@@ -59,7 +61,7 @@ def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table):
     assert (isinstance(cache_table, str))
     infer_modes = {
-        "default": lambda prompt: llm.ask(prompt[:args.limit_prompt] if args.limit_prompt is not None else prompt)
+        "default": lambda prompt: llm.ask_safe(prompt[:args.limit_prompt] if args.limit_prompt is not None else prompt)
     }
     def optional_update_data_records(c, data):
@@ -75,11 +77,11 @@ def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table):
         return data[c]
     cache_providers = {
-        "sqlite": lambda filepath, table_name, data_it: SQLiteProvider.write_auto(
+        "sqlite": lambda filepath, table_name, data_it: SQLite3Service.write_missed(
             data_it=data_it, target=filepath,
             data2col_func=optional_update_data_records,
             table_name=handle_table_name(table_name if table_name is not None else "contents"),
-            id_column_name="uid")
+            id_column_name=id_column_name)
     }
     # We optionally wrap into limiter.
@@ -90,18 +92,18 @@ def iter_content(input_dicts_iter, llm, schema, cache_target, cache_table):
     # Provide data caching.
     cache_providers["sqlite"](cache_target, table_name=tgt_meta, data_it=tqdm(queries_it, desc="Iter content"))
-    return SQLiteProvider.read(cache_target, table=cache_table)
+    return SQLite3Service.read(cache_target, table=cache_table)
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Infer Instruct LLM inference based on CoT schema")
     parser.add_argument('--adapter', dest='adapter', type=str, default=None)
+    parser.add_argument('--attempts', dest='attempts', type=int, default=None)
+    parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
     parser.add_argument('--src', dest='src', type=str, default=None)
     parser.add_argument('--schema', dest='schema', type=str, default=None,
                         help="Path to the JSON file that describes schema")
-    parser.add_argument('--csv-sep', dest='csv_sep', type=str, default='\t')
-    parser.add_argument('--csv-escape-char', dest='csv_escape_char', type=str, default=None)
     parser.add_argument('--to', dest='to', type=str, default=None, choices=["csv", "sqlite"])
     parser.add_argument('--output', dest='output', type=str, default=None)
     parser.add_argument('--limit', dest='limit', type=int, default=None,
@@ -114,7 +116,8 @@ if __name__ == '__main__':
     args = parser.parse_args(args=native_args[1:])
     # Initialize Large Language Model.
-    llm, llm_model_name = init_llm(**CmdArgsService.args_to_dict(model_args))
+    model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
+    llm, llm_model_name = init_llm(**model_args_dict)
     # Setup schema.
     schema = init_schema(args.schema)
@@ -123,17 +126,19 @@ if __name__ == '__main__':
     input_providers = {
         None: lambda _: chat_with_lm(llm, chain=schema.chain, model_name=llm_model_name),
-        "csv": lambda filepath: CsvService.read(target=filepath, row_id_key="uid", delimiter=args.csv_sep,
-                                                as_dict=True, skip_header=True, escapechar=args.csv_escape_char),
-        "jsonl": lambda filepath: JsonService.read_lines(src=filepath, row_id_key="uid")
+        "csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
+                                                as_dict=True, skip_header=True,
+                                                delimiter=model_args_dict.get("delimiter", "\t"),
+                                                escapechar=model_args_dict.get("escapechar", None)),
+        "jsonl": lambda filepath: JsonlService.read(src=filepath, row_id_key=args.id_col)
     }
     output_providers = {
         "csv": lambda filepath, data_it, header:
-        CsvService.write_handled(target=filepath, data_it=data_it, header=header, data2col_func=lambda v: list(v)),
+            CsvService.write(target=filepath, data_it=data_it, header=header, it_type=None),
         "jsonl": lambda filepath, data_it, header:
-        JsonService.write_lines(target=filepath,
-                                data_it=map(lambda item: {key:item[i] for i, key in enumerate(header)}, data_it))
+        JsonlService.write(target=filepath,
+                           data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
     }
     # Setup output.
@@ -156,6 +161,7 @@ if __name__ == '__main__':
     data_it = iter_content(input_dicts_iter=input_providers[src_ext](src_filepath),
                            schema=schema,
                            llm=llm,
+                           id_column_name=args.id_col,
                            cache_target=cache_target,
                            cache_table=cache_table)
@@ -167,4 +173,4 @@ if __name__ == '__main__':
     # Perform output writing process.
     output_providers[tgt_ext](filepath=output_target,
                               data_it=data_it,
-                              header=SQLiteProvider.get_columns(target=cache_target, table=cache_table))
+                              header=SQLite3Service.read_columns(target=cache_target, table=cache_table))

{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bulk_chain
-Version: 0.24.0
+Version: 0.24.2
 Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
 Home-page: https://github.com/nicolay-r/bulk-chain
 Author: Nicolay Rusnachenko
@@ -16,10 +16,16 @@ Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: tqdm
+Requires-Dist: source-iter ==0.24.2
-# bulk-chain
+# bulk-chain 0.24.2
 ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
 [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
+[![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
+<p align="center">
+    <img src="logo.png"/>
+</p>
 A lightweight, no-strings-attached **[Chain-of-Thought](https://arxiv.org/abs/2201.11903) framework** for your LLM, ensuring reliable results for bulk input requests stored in `CSV` / `JSONL` / `sqlite`.
 It allows applying series of prompts formed into `schema` (See [related section](#chain-of-thought-schema))
@@ -33,7 +39,7 @@ It allows applying series of prompts formed into `schema` (See [related section]
 # Installation
 ```bash
-pip install git+https://github.com/nicolay-r/bulk-chain@master
+pip install bulk-chain
 ```
 ## Chain-of-Thought Schema

bulk_chain-0.24.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+bulk_chain/infer.py,sha256=QgbR64A1JS8B9oh0_ruynEfdCpoG1rPHVMtk5Z0Ch2U,7476
+bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+bulk_chain/core/llm_base.py,sha256=Cakuuy4jTVPOta5TyEPFTPGvFpJfM6m0dAIAYSu7zFM,1008
+bulk_chain/core/service_args.py,sha256=Qr3rHsAB8wnajB-DbU-GjiEpRZFP4D6s1lVTpLkPPX4,1294
+bulk_chain/core/service_data.py,sha256=18gQwSCTEsI7XFukq8AE5lDJX_QQRpasaH69g6EddV0,797
+bulk_chain/core/service_json.py,sha256=6o1xM_8c9QEjH9Q3qEmJylU9nahfRXhUd5sFF2dGJwo,182
+bulk_chain/core/service_llm.py,sha256=NoD5KHGtXCmN8SlpgH0Z5KCmSxZcnVVfp65vhVRoG84,2742
+bulk_chain/core/service_schema.py,sha256=JVhOv2YP5VEtiwOq_zgCzhS2uF_BOATAgg6fmKRf2NQ,1209
+bulk_chain/core/utils.py,sha256=UV6Cefaw7yZiYblsCr-s9LsbcI83xe7eESBvha9A2Og,2784
+bulk_chain-0.24.2.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
+bulk_chain-0.24.2.dist-info/METADATA,sha256=yEKF0X90AvNw6yq-W7oBdTLiH8KSrCKRHaERhOFLXFA,3685
+bulk_chain-0.24.2.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
+bulk_chain-0.24.2.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
+bulk_chain-0.24.2.dist-info/RECORD,,

bulk_chain/core/provider_sqlite.py DELETED Viewed

@@ -1,79 +0,0 @@
-import sqlite3
-class SQLiteProvider(object):
-    @staticmethod
-    def __create_table(table_name, columns, id_column_name,
-                       id_column_type, sqlite3_column_types, cur):
-        # Provide the ID column.
-        sqlite3_column_types = [id_column_type] + sqlite3_column_types
-        # Compose the whole columns list.
-        content = ", ".join([" ".join(item) for item in zip(columns, sqlite3_column_types)])
-        cur.execute(f"CREATE TABLE IF NOT EXISTS {table_name}({content})")
-        cur.execute(f"CREATE INDEX IF NOT EXISTS i_id ON {table_name}({id_column_name})")
-    @staticmethod
-    def write_auto(data_it, target, data2col_func, table_name, id_column_name="id",
-                   id_column_type="INTEGER"):
-        """ NOTE: data_it is an iterator of dictionaries.
-            This implementation automatically creates the table and
-        """
-        with sqlite3.connect(target) as con:
-            cur = con.cursor()
-            columns = None
-            for data in data_it:
-                assert(isinstance(data, dict))
-                # Extracting columns from data.
-                row_columns = list(data.keys())
-                assert(id_column_name in row_columns)
-                # Optionally create table.
-                if columns is None:
-                    # Setup list of columns.
-                    columns = row_columns
-                    # Place ID column first.
-                    columns.insert(0, columns.pop(columns.index(id_column_name)))
-                    SQLiteProvider.__create_table(
-                        columns=columns, table_name=table_name, cur=cur,
-                        id_column_name=id_column_name, id_column_type=id_column_type,
-                        sqlite3_column_types=["TEXT"] * len(columns))
-                # Check that each rows satisfies criteria of the first row.
-                [Exception(f"{column} is expected to be in row!") for column in row_columns if column not in columns]
-                uid = data[id_column_name]
-                r = cur.execute(f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE {id_column_name}='{uid}');")
-                ans = r.fetchone()[0]
-                if ans == 1:
-                    continue
-                params = ", ".join(tuple(['?'] * (len(columns))))
-                row_columns_str = ", ".join(row_columns)
-                cur.execute(f"INSERT INTO {table_name}({row_columns_str}) VALUES ({params})",
-                            [data2col_func(c, data) for c in row_columns])
-                con.commit()
-            cur.close()
-    @staticmethod
-    def read(target, column_names=None, table="content"):
-        with sqlite3.connect(target) as conn:
-            cursor = conn.cursor()
-            cols = "*" if column_names is None else ",".join(column_names)
-            cursor.execute(f"SELECT {cols} FROM {table}")
-            for row in cursor:
-                yield row
-    @staticmethod
-    def get_columns(target, table="content"):
-        with sqlite3.connect(target) as conn:
-            cursor = conn.cursor()
-            cursor.execute(f"PRAGMA table_info({table})")
-            return [row[1] for row in cursor.fetchall()]

bulk_chain/core/service_csv.py DELETED Viewed

@@ -1,57 +0,0 @@
-import csv
-import logging
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-class CsvService:
-    @staticmethod
-    def write(target, lines_it):
-        f = open(target, "w")
-        logger.info(f"Saving: {target}")
-        w = csv.writer(f, delimiter="\t", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-        for content in lines_it:
-            w.writerow(content)
-    @staticmethod
-    def write_handled(target, data_it, data2col_func, header):
-        def __it():
-            yield header
-            for data in data_it:
-                content = data2col_func(data)
-                assert(len(content) == len(header))
-                yield content
-        CsvService.write(target, lines_it=__it())
-    @staticmethod
-    def read(target, skip_header=False, cols=None, as_dict=False, row_id_key=None, **csv_kwargs):
-        assert (isinstance(row_id_key, str) or row_id_key is None)
-        assert (isinstance(cols, list) or cols is None)
-        header = None
-        with open(target, newline='\n') as f:
-            for row_id, row in enumerate(csv.reader(f, **csv_kwargs)):
-                if skip_header and row_id == 0:
-                    header = ([row_id_key] if row_id_key is not None else []) + row
-                    continue
-                # Determine the content we wish to return.
-                if cols is None:
-                    content = row
-                else:
-                    row_d = {header[col_ind]: value for col_ind, value in enumerate(row)}
-                    content = [row_d[col_name] for col_name in cols]
-                content = ([row_id-1] if row_id_key is not None else []) + content
-                # Optionally attach row_id to the content.
-                if as_dict:
-                    assert (header is not None)
-                    assert (len(content) == len(header))
-                    yield {k: v for k, v in zip(header, content)}
-                else:
-                    yield content

bulk_chain-0.24.0.dist-info/RECORD DELETED Viewed

@@ -1,17 +0,0 @@
-bulk_chain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bulk_chain/infer.py,sha256=HXFcl_7u5sgybDv_v5_up-Mpe-zSX0vtgsG1Wh1h-UA,7184
-bulk_chain/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bulk_chain/core/llm_base.py,sha256=5js2RJLpNS5t-De-xTpZCbLMgbz3F_b9tU_CtXhy02I,259
-bulk_chain/core/provider_sqlite.py,sha256=D7axdeTDvv-ULHKTalFWbeKC3WaYOLI7lVrXFAXkct8,3213
-bulk_chain/core/service_args.py,sha256=Qr3rHsAB8wnajB-DbU-GjiEpRZFP4D6s1lVTpLkPPX4,1294
-bulk_chain/core/service_csv.py,sha256=-m8tNN9aIqRfJa4sPUX8ZUDP4W0fgnnOR3_0PapepDY,1984
-bulk_chain/core/service_data.py,sha256=18gQwSCTEsI7XFukq8AE5lDJX_QQRpasaH69g6EddV0,797
-bulk_chain/core/service_json.py,sha256=alYqTQbBjAcCh7anSTOZs1CLJbiWrLPpzLcoADstD0Q,743
-bulk_chain/core/service_llm.py,sha256=tYgMphJkXunhxdrThdfI4eM8qQTCZfEM1kabbReVjuQ,2726
-bulk_chain/core/service_schema.py,sha256=JVhOv2YP5VEtiwOq_zgCzhS2uF_BOATAgg6fmKRf2NQ,1209
-bulk_chain/core/utils.py,sha256=UV6Cefaw7yZiYblsCr-s9LsbcI83xe7eESBvha9A2Og,2784
-bulk_chain-0.24.0.dist-info/LICENSE,sha256=VF9SjNpwwSSFEY_eP_8A1ocDCrbwfjI1pZexXdCkOwo,1076
-bulk_chain-0.24.0.dist-info/METADATA,sha256=l_RpSlOGQzuA0buVn7I54XN_c9Fn_5Y6lhNPkqlhYqo,3496
-bulk_chain-0.24.0.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
-bulk_chain-0.24.0.dist-info/top_level.txt,sha256=Hxq_wyH-GDXKBaA63UfBIiMJO2eCHJG5EOrXDphpeB4,11
-bulk_chain-0.24.0.dist-info/RECORD,,

{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{bulk_chain-0.24.0.dist-info → bulk_chain-0.24.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

bulk-chain 0.24.0__py3-none-any.whl → 0.24.2__py3-none-any.whl

bulk-chain 0.24.0__py3-none-any.whl → 0.24.2__py3-none-any.whl