bulk-chain 0.25.0__tar.gz → 0.25.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/PKG-INFO +34 -12
  2. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/README.md +31 -11
  3. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/api.py +22 -2
  4. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_args.py +6 -1
  5. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_llm.py +36 -15
  6. bulk_chain-0.25.1/bulk_chain/core/utils_logger.py +41 -0
  7. bulk_chain-0.25.1/bulk_chain/demo.py +85 -0
  8. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/infer.py +5 -34
  9. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain.egg-info/PKG-INFO +34 -12
  10. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain.egg-info/SOURCES.txt +6 -1
  11. bulk_chain-0.25.1/bulk_chain.egg-info/requires.txt +2 -0
  12. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/setup.py +1 -1
  13. bulk_chain-0.25.1/test/test.py +62 -0
  14. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/test/test_api.py +5 -4
  15. bulk_chain-0.25.1/test/test_cmdargs.py +29 -0
  16. bulk_chain-0.25.1/test/test_provider_batching.py +31 -0
  17. bulk_chain-0.25.0/test/test_cmdargs.py +0 -20
  18. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/LICENSE +0 -0
  19. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/__init__.py +0 -0
  20. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/__init__.py +0 -0
  21. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/llm_base.py +0 -0
  22. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_batch.py +0 -0
  23. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_data.py +0 -0
  24. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_dict.py +0 -0
  25. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_json.py +0 -0
  26. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/service_schema.py +0 -0
  27. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain/core/utils.py +0 -0
  28. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain.egg-info/dependency_links.txt +0 -0
  29. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/bulk_chain.egg-info/top_level.txt +0 -0
  30. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/setup.cfg +0 -0
  31. {bulk_chain-0.25.0 → bulk_chain-0.25.1}/test/test_args_seeking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.0
3
+ Version: 0.25.1
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,8 +15,10 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: tqdm
19
+ Requires-Dist: source-iter==0.24.3
18
20
 
19
- # bulk-chain 0.25.0
21
+ # bulk-chain 0.25.1
20
22
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
23
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
24
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -26,7 +28,11 @@ License-File: LICENSE
26
28
  <img src="logo.png"/>
27
29
  </p>
28
30
 
29
- A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
31
+ <p align="center">
32
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
+ </p>
34
+
35
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
30
36
 
31
37
  ### Main Features
32
38
  * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
@@ -42,7 +48,7 @@ A lightweight, no-strings-attached **framework** for your LLM that allows apply
42
48
  From PyPI:
43
49
 
44
50
  ```bash
45
- pip install bulk-chain
51
+ pip install --no-deps bulk-chain
46
52
  ```
47
53
 
48
54
  or latest version from here:
@@ -73,14 +79,12 @@ Below, is an example on how to declare your own schema:
73
79
  }
74
80
  ```
75
81
 
76
- Another templates are available [here](/ext/schema/).
77
-
78
82
  # Usage
79
83
 
80
84
  Preliminary steps:
81
85
 
82
86
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
83
- 2. Wrap or pick **LLM model** from the [list of presets](/ext/).
87
+ 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
84
88
 
85
89
  ## API
86
90
 
@@ -88,16 +92,34 @@ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/b
88
92
 
89
93
  ## Shell
90
94
 
91
- > **NOTE:** You have to install `source-iter` package
95
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
92
96
 
97
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
98
+ ```bash
99
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
100
+ ```
101
+ 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
102
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
93
103
  ```bash
94
104
  python3 -m bulk_chain.infer \
95
105
  --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
96
- --schema "ext/schema/default.json" \
97
- --adapter "dynamic:ext/replicate.py:Replicate" \
106
+ --schema "test/schema/default.json" \
107
+ --adapter "dynamic:replicate_104.py:Replicate" \
108
+ %%m \
109
+ --model_name "deepseek-ai/deepseek-r1" \
110
+ --api_token "<REPLICATE-API-TOKEN>"
111
+ ```
112
+
113
+ Or, you can launch **demo mode** to interact with LLM via command line:
114
+ > **NOTE:** Demo supports streaming!
115
+ ```bash
116
+ python3 -m bulk_chain.demo \
117
+ --schema "test/schema/thor_cot_schema.json" \
118
+ --adapter "dynamic:replicate_104.py:Replicate" \
98
119
  %%m \
120
+ --model_name "meta/meta-llama-3-70b-instruct" \
99
121
  --api_token "<REPLICATE-API-TOKEN>" \
100
- --temp 0.1
122
+ --stream
101
123
  ```
102
124
 
103
125
  # Embed your LLM
@@ -106,4 +128,4 @@ All you have to do is to implement `BaseLM` class, that includes:
106
128
  * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
107
129
  * `ask(prompt)` -- infer your model with the given `prompt`.
108
130
 
109
- See examples with models [here](/ext).
131
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -1,4 +1,4 @@
1
- # bulk-chain 0.25.0
1
+ # bulk-chain 0.25.1
2
2
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
3
3
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
4
4
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -8,7 +8,11 @@
8
8
  <img src="logo.png"/>
9
9
  </p>
10
10
 
11
- A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
11
+ <p align="center">
12
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
13
+ </p>
14
+
15
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
12
16
 
13
17
  ### Main Features
14
18
  * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
@@ -24,7 +28,7 @@ A lightweight, no-strings-attached **framework** for your LLM that allows apply
24
28
  From PyPI:
25
29
 
26
30
  ```bash
27
- pip install bulk-chain
31
+ pip install --no-deps bulk-chain
28
32
  ```
29
33
 
30
34
  or latest version from here:
@@ -55,14 +59,12 @@ Below, is an example on how to declare your own schema:
55
59
  }
56
60
  ```
57
61
 
58
- Another templates are available [here](/ext/schema/).
59
-
60
62
  # Usage
61
63
 
62
64
  Preliminary steps:
63
65
 
64
66
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
65
- 2. Wrap or pick **LLM model** from the [list of presets](/ext/).
67
+ 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
66
68
 
67
69
  ## API
68
70
 
@@ -70,16 +72,34 @@ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/b
70
72
 
71
73
  ## Shell
72
74
 
73
- > **NOTE:** You have to install `source-iter` package
75
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
74
76
 
77
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
78
+ ```bash
79
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
80
+ ```
81
+ 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
82
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
75
83
  ```bash
76
84
  python3 -m bulk_chain.infer \
77
85
  --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
78
- --schema "ext/schema/default.json" \
79
- --adapter "dynamic:ext/replicate.py:Replicate" \
86
+ --schema "test/schema/default.json" \
87
+ --adapter "dynamic:replicate_104.py:Replicate" \
88
+ %%m \
89
+ --model_name "deepseek-ai/deepseek-r1" \
90
+ --api_token "<REPLICATE-API-TOKEN>"
91
+ ```
92
+
93
+ Or, you can launch **demo mode** to interact with LLM via command line:
94
+ > **NOTE:** Demo supports streaming!
95
+ ```bash
96
+ python3 -m bulk_chain.demo \
97
+ --schema "test/schema/thor_cot_schema.json" \
98
+ --adapter "dynamic:replicate_104.py:Replicate" \
80
99
  %%m \
100
+ --model_name "meta/meta-llama-3-70b-instruct" \
81
101
  --api_token "<REPLICATE-API-TOKEN>" \
82
- --temp 0.1
102
+ --stream
83
103
  ```
84
104
 
85
105
  # Embed your LLM
@@ -88,4 +108,4 @@ All you have to do is to implement `BaseLM` class, that includes:
88
108
  * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
89
109
  * `ask(prompt)` -- infer your model with the given `prompt`.
90
110
 
91
- See examples with models [here](/ext).
111
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -7,7 +7,7 @@ from bulk_chain.core.service_data import DataService
7
7
  from bulk_chain.core.service_dict import DictionaryService
8
8
  from bulk_chain.core.service_json import JsonService
9
9
  from bulk_chain.core.service_schema import SchemaService
10
-
10
+ from bulk_chain.core.utils import dynamic_init, find_by_prefix
11
11
 
12
12
  INFER_MODES = {
13
13
  "default": lambda llm, prompt, limit_prompt=None: llm.ask_core(
@@ -76,4 +76,24 @@ def iter_content(input_dicts_it, llm, schema, batch_size=1, return_batch=True, l
76
76
  schema=schema)
77
77
  for batch in BatchIterator(prompts_it, batch_size=batch_size))
78
78
 
79
- yield from content_it if return_batch else chain.from_iterable(content_it)
79
+ yield from content_it if return_batch else chain.from_iterable(content_it)
80
+
81
+
82
+ def init_llm(adapter, **model_kwargs):
83
+ """ This method perform dynamic initialization of LLM from third-party resource.
84
+ """
85
+
86
+ # List of the Supported models and their API wrappers.
87
+ models_preset = {
88
+ "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
89
+ class_name=llm_model_params)(**model_kwargs)
90
+ }
91
+
92
+ # Initialize LLM model.
93
+ params = adapter.split(':')
94
+ llm_model_type = params[0]
95
+ llm_model_name = params[1] if len(params) > 1 else params[-1]
96
+ llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
97
+ llm = find_by_prefix(d=models_preset, key=llm_model_type)()
98
+
99
+ return llm, llm_model_name
@@ -12,6 +12,11 @@ class CmdArgsService:
12
12
  def iter_arguments(lst):
13
13
 
14
14
  def __release():
15
+
16
+ # We use the True value by default to treat the related parameter as flag.
17
+ if len(buf) == 0:
18
+ buf.append(True)
19
+
15
20
  return key, buf if len(buf) > 1 else buf[0]
16
21
 
17
22
  key = None
@@ -29,7 +34,7 @@ class CmdArgsService:
29
34
  buf.append(a)
30
35
 
31
36
  # Sharing the remaining params.
32
- if len(buf) > 0:
37
+ if key is not None:
33
38
  yield __release()
34
39
 
35
40
  @staticmethod
@@ -1,8 +1,7 @@
1
- import logging
2
-
3
1
  from bulk_chain.core.llm_base import BaseLM
4
2
  from bulk_chain.core.service_data import DataService
5
3
  from bulk_chain.core.utils import iter_params
4
+ from bulk_chain.core.utils_logger import StreamedLogger
6
5
 
7
6
 
8
7
  def pad_str(text, pad):
@@ -23,29 +22,32 @@ def nice_output(text, width, pad=4, remove_new_line=False):
23
22
  return text_wrap(content=short_text, width=width, handle_line=lambda line: pad_str(line, pad=pad))
24
23
 
25
24
 
26
- def chat_with_lm(lm, chain=None, model_name=None):
25
+ def chat_with_lm(lm, preset_dict=None, chain=None, model_name=None, line_width=80, pad=0):
27
26
  assert (isinstance(lm, BaseLM))
28
27
  assert (isinstance(chain, list))
29
28
  assert (isinstance(model_name, str) or model_name is None)
30
29
 
31
- logger = logging.getLogger(__name__)
32
- logging.basicConfig(level=logging.INFO)
30
+ preset_dict = {} if preset_dict is None else preset_dict
31
+
32
+ streamed_logger = StreamedLogger(__name__)
33
33
 
34
34
  do_exit = False
35
35
  model_name = model_name if model_name is not None else "agent"
36
36
 
37
37
  while not do_exit:
38
38
 
39
- logger.info("----------------")
39
+ streamed_logger.info("----------------")
40
+ streamed_logger.info("\n")
40
41
 
41
42
  # Launching the CoT engine loop.
42
- data_dict = {}
43
- for prompt_args in chain:
43
+ data_dict = {} | preset_dict
44
+ for chain_ind, prompt_args in enumerate(chain):
44
45
 
45
46
  # Processing the prompt.
46
47
  prompt = prompt_args["prompt"]
47
48
 
48
49
  # Filling necessary parameters.
50
+ user_informed = False
49
51
  field_names = list(iter_params(prompt))
50
52
  for ind, f_name in enumerate(field_names):
51
53
 
@@ -54,6 +56,7 @@ def chat_with_lm(lm, chain=None, model_name=None):
54
56
 
55
57
  user_input = input(f"Enter your prompt for `{f_name}` ({ind+1}/{len(field_names)}) "
56
58
  f"(or 'exit' to quit): ")
59
+ user_informed = True
57
60
 
58
61
  if user_input.lower() == 'exit':
59
62
  do_exit = True
@@ -64,19 +67,37 @@ def chat_with_lm(lm, chain=None, model_name=None):
64
67
  if do_exit:
65
68
  break
66
69
 
70
+ # In the case of the initial interaction with the chain.
71
+ # we make sure that aware user for starting interaction.
72
+ if chain_ind == 0 and not user_informed:
73
+ user_input = input(f"Enter to continue (or 'exit' to quit) ...")
74
+ if user_input.lower() == 'exit':
75
+ do_exit = True
76
+
67
77
  # Finally asking LLM.
68
78
  DataService.compose_prompt_text(prompt=prompt, data_dict=data_dict, field_names=field_names)
69
79
  actual_prompt = DataService.get_prompt_text(prompt=prompt, data_dict=data_dict)
70
80
 
71
81
  # Returning meta information, passed to LLM.
72
- pad = 4
73
- logger.info(pad_str(f"{model_name} (ask) ->", pad=pad))
74
- logger.info(nice_output(actual_prompt, pad=pad*2, remove_new_line=True, width=80))
82
+ streamed_logger.info(pad_str(f"{model_name} (ask [{chain_ind+1}/{len(chain)}]) ->", pad=pad))
83
+ streamed_logger.info("\n")
84
+ streamed_logger.info(nice_output(actual_prompt, pad=pad, remove_new_line=True, width=line_width))
85
+ streamed_logger.info("\n\n")
75
86
 
76
87
  # Response.
77
- response_batch = lm.ask_core(batch=[actual_prompt])
78
- logger.info(pad_str(f"{model_name} (resp)->", pad=pad))
79
- logger.info(nice_output(response_batch[0], pad=pad * 2, remove_new_line=False, width=80))
88
+ response = lm.ask_core(batch=[actual_prompt])[0]
89
+ streamed_logger.info(pad_str(f"{model_name} (resp [{chain_ind+1}/{len(chain)}])->", pad=pad))
90
+ streamed_logger.info("\n")
91
+ if isinstance(response, str):
92
+ streamed_logger.info(nice_output(response, pad=pad, remove_new_line=False, width=line_width))
93
+ buffer = [response]
94
+ else:
95
+ buffer = []
96
+ for chunk in response:
97
+ streamed_logger.info(chunk)
98
+ buffer.append(str(chunk))
99
+
100
+ streamed_logger.info("\n\n")
80
101
 
81
102
  # Collecting the answer for the next turn.
82
- data_dict[prompt_args["out"]] = response_batch[0]
103
+ data_dict[prompt_args["out"]] = "".join(buffer)
@@ -0,0 +1,41 @@
1
+ import logging
2
+
3
+
4
+ def StreamedLogger(name: str) -> logging.Logger:
5
+ """ https://medium.com/@r.das699/optimizing-logging-practices-for-streaming-data-in-python-521798e1ed82
6
+ """
7
+ root_handlers = logging.getLogger().handlers
8
+ current_logger = logging.getLogger(name)
9
+ if not root_handlers:
10
+ new_handler = logging.StreamHandler()
11
+ new_handler.terminator = ""
12
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
13
+ current_logger.addHandler(new_handler)
14
+ current_logger.propagate = False
15
+ current_logger.setLevel(logging.INFO)
16
+ return current_logger
17
+
18
+ for handler in current_logger.handlers[:]:
19
+ current_logger.removeHandler(handler)
20
+
21
+ for handler_r in root_handlers:
22
+ if type(handler_r) is logging.StreamHandler:
23
+ new_handler = logging.StreamHandler()
24
+ new_handler.terminator = ""
25
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
26
+ current_logger.addHandler(new_handler)
27
+ elif type(handler_r) is logging.FileHandler:
28
+ new_handler = logging.FileHandler(
29
+ handler_r.baseFilename,
30
+ handler_r.mode,
31
+ handler_r.encoding,
32
+ handler_r.delay,
33
+ handler_r.errors,
34
+ )
35
+ new_handler.terminator = "" # This will stop the printing in new line
36
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
37
+ current_logger.addHandler(new_handler)
38
+ else:
39
+ continue
40
+ current_logger.propagate = False # Don't propagate to root logger
41
+ return current_logger
@@ -0,0 +1,85 @@
1
+ import json
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+
7
+ from source_iter.service_jsonl import JsonlService
8
+
9
+ from bulk_chain.api import init_llm
10
+ from bulk_chain.core.service_args import CmdArgsService
11
+ from bulk_chain.core.service_json import JsonService
12
+ from bulk_chain.core.service_llm import chat_with_lm
13
+ from bulk_chain.core.service_schema import SchemaService
14
+ from bulk_chain.core.utils import parse_filepath
15
+
16
+ logger = logging.getLogger(__name__)
17
+ logging.basicConfig(level=logging.INFO)
18
+
19
+
20
+ def iter_from_json(filepath):
21
+ with open(filepath, "r") as f:
22
+ content = json.load(f)
23
+ for key, value in content.items():
24
+ yield key, value
25
+
26
+
27
+ def iter_from_text_file(filepath):
28
+ with open(filepath, "r") as f:
29
+ yield filepath.split('.')[0], f.read()
30
+
31
+
32
+ if __name__ == '__main__':
33
+
34
+ parser = argparse.ArgumentParser(description="LLM demo usage based on CoT schema")
35
+ parser.add_argument('--adapter', dest='adapter', type=str, default=None)
36
+ parser.add_argument('--attempts', dest='attempts', type=int, default=None)
37
+ parser.add_argument('--src', dest='src', type=str, nargs="*", default=None)
38
+ parser.add_argument('--schema', dest='schema', type=str, default=None,
39
+ help="Path to the JSON file that describes schema")
40
+ parser.add_argument('--limit-prompt', dest="limit_prompt", type=int, default=None,
41
+ help="Optional trimming prompt by the specified amount of characters.")
42
+
43
+ # Extract native arguments.
44
+ native_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
45
+ args = parser.parse_args(args=native_args[1:])
46
+
47
+ # Extract model-related arguments and Initialize Large Language Model.
48
+ model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
49
+ model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
50
+ llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
51
+
52
+ # Setup schema.
53
+ schema = SchemaService(json_data=JsonService.read(args.schema))
54
+ schema_name = schema.src.get("name", None)
55
+ if schema is not None:
56
+ logger.info(f"Using schema: {schema_name}")
57
+
58
+ output_providers = {
59
+ "jsonl": lambda filepath, data_it, header:
60
+ JsonlService.write(target=filepath,
61
+ data_it=map(lambda item: {key: item[i] for i, key in enumerate(header)}, data_it))
62
+ }
63
+
64
+ input_file_handlers = {
65
+ "json": lambda filepath: iter_from_json(filepath),
66
+ "txt": lambda filepath: iter_from_text_file(filepath)
67
+ }
68
+
69
+ # Input extension type defines the provider.
70
+ if args.src is None:
71
+ args.src = []
72
+ if isinstance(args.src, str):
73
+ args.src = [args.src]
74
+ sources = [parse_filepath(s) for s in args.src]
75
+
76
+ preset_dict = {}
77
+ for fp, ext, _ in sources:
78
+ for key, value in input_file_handlers[ext](fp):
79
+ if key in preset_dict:
80
+ raise Exception(f"While at handling {fp}: Key {key} is already registered!")
81
+ preset_dict[key] = value
82
+
83
+ # Launch Demo.
84
+ chat_with_lm(llm, preset_dict=preset_dict, chain=schema.chain, model_name=llm_model_name,
85
+ line_width=120)
@@ -9,51 +9,28 @@ from source_iter.service_jsonl import JsonlService
9
9
  from source_iter.service_sqlite import SQLite3Service
10
10
  from tqdm import tqdm
11
11
 
12
- from bulk_chain.api import INFER_MODES, _infer_batch, CWD
12
+ from bulk_chain.api import INFER_MODES, _infer_batch, CWD, init_llm
13
13
  from bulk_chain.core.llm_base import BaseLM
14
14
  from bulk_chain.core.service_args import CmdArgsService
15
15
  from bulk_chain.core.service_dict import DictionaryService
16
16
  from bulk_chain.core.service_json import JsonService
17
- from bulk_chain.core.service_llm import chat_with_lm
18
17
  from bulk_chain.core.service_schema import SchemaService
19
- from bulk_chain.core.utils import dynamic_init, find_by_prefix, handle_table_name, optional_limit_iter, parse_filepath
18
+ from bulk_chain.core.utils import handle_table_name, optional_limit_iter, parse_filepath
20
19
 
21
20
  logger = logging.getLogger(__name__)
22
21
  logging.basicConfig(level=logging.INFO)
23
22
 
24
-
25
23
  WRITER_PROVIDERS = {
26
24
  "sqlite": lambda filepath, table_name, data_it, infer_data_func, **kwargs: SQLite3Service.write(
27
25
  data_it=data_it, target=filepath, table_name=table_name, data2col_func=infer_data_func,
28
26
  skip_existed=True, **kwargs)
29
27
  }
30
28
 
31
-
32
29
  READER_PROVIDERS = {
33
30
  "sqlite": lambda filepath, table_name: SQLite3Service.read(filepath, table=table_name)
34
31
  }
35
32
 
36
33
 
37
- def init_llm(**model_kwargs):
38
- """ This method perform dynamic initialization of LLM from third-party resource.
39
- """
40
-
41
- # List of the Supported models and their API wrappers.
42
- models_preset = {
43
- "dynamic": lambda: dynamic_init(class_dir=CWD, class_filepath=llm_model_name,
44
- class_name=llm_model_params)(**model_kwargs)
45
- }
46
-
47
- # Initialize LLM model.
48
- params = args.adapter.split(':')
49
- llm_model_type = params[0]
50
- llm_model_name = params[1] if len(params) > 1 else params[-1]
51
- llm_model_params = ':'.join(params[2:]) if len(params) > 2 else None
52
- llm = find_by_prefix(d=models_preset, key=llm_model_type)()
53
-
54
- return llm, llm_model_name
55
-
56
-
57
34
  def iter_content_cached(input_dicts_it, llm, schema, cache_target, limit_prompt=None, **cache_kwargs):
58
35
  assert (isinstance(llm, BaseLM))
59
36
  assert (isinstance(cache_target, str))
@@ -93,7 +70,7 @@ if __name__ == '__main__':
93
70
  parser.add_argument('--adapter', dest='adapter', type=str, default=None)
94
71
  parser.add_argument('--attempts', dest='attempts', type=int, default=None)
95
72
  parser.add_argument('--id-col', dest='id_col', type=str, default="uid")
96
- parser.add_argument('--src', dest='src', type=str, default=None)
73
+ parser.add_argument('--src', dest='src', type=str, nargs="?", default=None)
97
74
  parser.add_argument('--schema', dest='schema', type=str, default=None,
98
75
  help="Path to the JSON file that describes schema")
99
76
  parser.add_argument('--to', dest='to', type=str, default=None, choices=["csv", "sqlite"])
@@ -114,7 +91,7 @@ if __name__ == '__main__':
114
91
  # Extract model-related arguments and Initialize Large Language Model.
115
92
  model_args = CmdArgsService.find_grouped_args(lst=sys.argv, starts_with="%%m", end_prefix="%%")
116
93
  model_args_dict = CmdArgsService.args_to_dict(model_args) | {"attempts": args.attempts}
117
- llm, llm_model_name = init_llm(**model_args_dict)
94
+ llm, llm_model_name = init_llm(adapter=args.adapter, **model_args_dict)
118
95
 
119
96
  # Setup schema.
120
97
  schema = SchemaService(json_data=JsonService.read(args.schema))
@@ -123,7 +100,6 @@ if __name__ == '__main__':
123
100
  logger.info(f"Using schema: {schema_name}")
124
101
 
125
102
  input_providers = {
126
- None: lambda _: chat_with_lm(llm, chain=schema.chain, model_name=llm_model_name),
127
103
  "csv": lambda filepath: CsvService.read(src=filepath, row_id_key=args.id_col,
128
104
  as_dict=True, skip_header=True,
129
105
  delimiter=csv_args_dict.get("delimiter", ","),
@@ -155,14 +131,9 @@ if __name__ == '__main__':
155
131
  args.output = args.output.format(model=llm.name()) if args.output is not None else args.output
156
132
  tgt_filepath, tgt_ext, tgt_meta = parse_filepath(args.output, default_ext=args.to)
157
133
 
158
- # Input extension type defines the provider.
134
+ # We do not support multiple files for other modes.
159
135
  src_filepath, src_ext, src_meta = parse_filepath(args.src)
160
136
 
161
- # Check whether we are in chat mode.
162
- if src_ext is None:
163
- input_providers[src_ext](None)
164
- exit(0)
165
-
166
137
  def default_output_file_template(ext):
167
138
  # This is a default template for output files to be generated.
168
139
  return "".join(["_".join([join(CWD, basename(src_filepath)), llm.name(), schema_name]), ext])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bulk_chain
3
- Version: 0.25.0
3
+ Version: 0.25.1
4
4
  Summary: A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, ensuring reliable results for bulk input requests.
5
5
  Home-page: https://github.com/nicolay-r/bulk-chain
6
6
  Author: Nicolay Rusnachenko
@@ -15,8 +15,10 @@ Classifier: Topic :: Text Processing :: Linguistic
15
15
  Requires-Python: >=3.6
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: tqdm
19
+ Requires-Dist: source-iter==0.24.3
18
20
 
19
- # bulk-chain 0.25.0
21
+ # bulk-chain 0.25.1
20
22
  ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
21
23
  [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/bulk-chain/blob/master/bulk_chain_tutorial.ipynb)
22
24
  [![twitter](https://img.shields.io/twitter/url/https/shields.io.svg?style=social)](https://x.com/nicolayr_/status/1847969224636961033)
@@ -26,7 +28,11 @@ License-File: LICENSE
26
28
  <img src="logo.png"/>
27
29
  </p>
28
30
 
29
- A lightweight, no-strings-attached **framework** for your LLM that allows applying [Chain-of-Thought](https://arxiv.org/abs/2201.11903) prompt `schema` (See [related section](#chain-of-thought-schema)) towards a massive textual collections.
31
+ <p align="center">
32
+ <a href="https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm"><b>Third-party providers hosting</b>↗️</a>
33
+ </p>
34
+
35
+ A no-strings-attached **framework** for your LLM that allows applying a Chain-of-Thought-like [prompt `schema`](#chain-of-thought-schema) to massive textual collections using custom **[third-party providers ↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm)**.
30
36
 
31
37
  ### Main Features
32
38
  * ✅ **No-strings**: you're free to LLM dependencies and flexible `venv` customization.
@@ -42,7 +48,7 @@ A lightweight, no-strings-attached **framework** for your LLM that allows apply
42
48
  From PyPI:
43
49
 
44
50
  ```bash
45
- pip install bulk-chain
51
+ pip install --no-deps bulk-chain
46
52
  ```
47
53
 
48
54
  or latest version from here:
@@ -73,14 +79,12 @@ Below, is an example on how to declare your own schema:
73
79
  }
74
80
  ```
75
81
 
76
- Another templates are available [here](/ext/schema/).
77
-
78
82
  # Usage
79
83
 
80
84
  Preliminary steps:
81
85
 
82
86
  1. Define your [schema](#chain-of-thought-schema) ([Example for Sentiment Analysis](/ext/schema/thor_cot_schema.json))
83
- 2. Wrap or pick **LLM model** from the [list of presets](/ext/).
87
+ 2. Wrap or pick **LLM model** from the [<b>Third-party providers hosting</b>↗️](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
84
88
 
85
89
  ## API
86
90
 
@@ -88,16 +92,34 @@ Please take a look at the [**related Wiki page**](https://github.com/nicolay-r/b
88
92
 
89
93
  ## Shell
90
94
 
91
- > **NOTE:** You have to install `source-iter` package
95
+ > **NOTE:** You have to install the `source-iter` and `tqdm` packages, which are the actual [dependencies](dependencies.txt) of this project.
92
96
 
97
+ 1. ⬇️ Download [replicate](https://replicate.com/) provider for `bulk-chain`:
98
+ ```bash
99
+ wget https://raw.githubusercontent.com/nicolay-r/nlp-thirdgate/refs/heads/master/llm/replicate_104.py
100
+ ```
101
+ 2. 📜 Setup your reasoning `schema.json` according to the [following example ↗️](test/schema/default.json)
102
+ 3. 🚀 Launch inference using `DeepSeek-R1`:
93
103
  ```bash
94
104
  python3 -m bulk_chain.infer \
95
105
  --src "<PATH-TO-YOUR-CSV-or-JSONL>" \
96
- --schema "ext/schema/default.json" \
97
- --adapter "dynamic:ext/replicate.py:Replicate" \
106
+ --schema "test/schema/default.json" \
107
+ --adapter "dynamic:replicate_104.py:Replicate" \
108
+ %%m \
109
+ --model_name "deepseek-ai/deepseek-r1" \
110
+ --api_token "<REPLICATE-API-TOKEN>"
111
+ ```
112
+
113
+ Or, you can launch **demo mode** to interact with LLM via command line:
114
+ > **NOTE:** Demo supports streaming!
115
+ ```bash
116
+ python3 -m bulk_chain.demo \
117
+ --schema "test/schema/thor_cot_schema.json" \
118
+ --adapter "dynamic:replicate_104.py:Replicate" \
98
119
  %%m \
120
+ --model_name "meta/meta-llama-3-70b-instruct" \
99
121
  --api_token "<REPLICATE-API-TOKEN>" \
100
- --temp 0.1
122
+ --stream
101
123
  ```
102
124
 
103
125
  # Embed your LLM
@@ -106,4 +128,4 @@ All you have to do is to implement `BaseLM` class, that includes:
106
128
  * `__init__` -- for setting up *batching mode support* and (optional) *model name*;
107
129
  * `ask(prompt)` -- infer your model with the given `prompt`.
108
130
 
109
- See examples with models [here](/ext).
131
+ See examples with models [at nlp-thirdgate 🌌](https://github.com/nicolay-r/nlp-thirdgate?tab=readme-ov-file#llm).
@@ -3,10 +3,12 @@ README.md
3
3
  setup.py
4
4
  bulk_chain/__init__.py
5
5
  bulk_chain/api.py
6
+ bulk_chain/demo.py
6
7
  bulk_chain/infer.py
7
8
  bulk_chain.egg-info/PKG-INFO
8
9
  bulk_chain.egg-info/SOURCES.txt
9
10
  bulk_chain.egg-info/dependency_links.txt
11
+ bulk_chain.egg-info/requires.txt
10
12
  bulk_chain.egg-info/top_level.txt
11
13
  bulk_chain/core/__init__.py
12
14
  bulk_chain/core/llm_base.py
@@ -18,6 +20,9 @@ bulk_chain/core/service_json.py
18
20
  bulk_chain/core/service_llm.py
19
21
  bulk_chain/core/service_schema.py
20
22
  bulk_chain/core/utils.py
23
+ bulk_chain/core/utils_logger.py
24
+ test/test.py
21
25
  test/test_api.py
22
26
  test/test_args_seeking.py
23
- test/test_cmdargs.py
27
+ test/test_cmdargs.py
28
+ test/test_provider_batching.py
@@ -0,0 +1,2 @@
1
+ tqdm
2
+ source-iter==0.24.3
@@ -15,7 +15,7 @@ def get_requirements(filenames):
15
15
 
16
16
  setup(
17
17
  name='bulk_chain',
18
- version='0.25.0',
18
+ version='0.25.1',
19
19
  python_requires=">=3.6",
20
20
  description='A lightweight, no-strings-attached Chain-of-Thought framework for your LLM, '
21
21
  'ensuring reliable results for bulk input requests.',
@@ -0,0 +1,62 @@
1
+ import random
2
+ import time
3
+ import logging
4
+
5
+
6
+ def setup_logger_behaviour(name: str) -> logging.Logger:
7
+ root_handlers = logging.getLogger().handlers # gets root logger
8
+ current_logger = logging.getLogger(name) # gets current logger
9
+ if not root_handlers: # if root logger has no handlers then create a streaming handler only
10
+ new_handler = logging.StreamHandler()
11
+ new_handler.terminator = ""
12
+ new_handler.setFormatter(logging.Formatter("%(message)s"))
13
+ current_logger.addHandler(new_handler)
14
+ current_logger.propagate = False
15
+ current_logger.setLevel(logging.INFO)
16
+ return current_logger
17
+
18
+ # Remove existing handlers from the current logger
19
+ for handler in current_logger.handlers[:]:
20
+ current_logger.removeHandler(handler)
21
+
22
+ for handler_r in root_handlers: # if root logger has handlers
23
+ if type(handler_r) is logging.StreamHandler: # if root logger has streaming handler
24
+ new_handler = logging.StreamHandler()
25
+ new_handler.terminator = "" # This will stop the printing in new line
26
+ new_handler.setFormatter(logging.Formatter("%(message)s")) # This will set the format
27
+ current_logger.addHandler(new_handler)
28
+ elif type(handler_r) is logging.FileHandler: # if root logger has file handler
29
+ new_handler = logging.FileHandler( # create new file handler
30
+ handler_r.baseFilename, # with same filename and other properties
31
+ handler_r.mode,
32
+ handler_r.encoding,
33
+ handler_r.delay,
34
+ handler_r.errors,
35
+ )
36
+ new_handler.terminator = "" # This will stop the printing in new line
37
+ new_handler.setFormatter(logging.Formatter("%(message)s")) # This will set the format
38
+ current_logger.addHandler(new_handler)
39
+ else:
40
+ continue
41
+ current_logger.propagate = False # Don't propagate to root logger
42
+ return current_logger
43
+
44
+ # Configure the logger
45
+ logger =logging.getLogger(__name__)
46
+ class FakeStreamingDataGenerator:
47
+
48
+ def stream_data(self):
49
+ while True:
50
+ data = random.randint(0, 100)
51
+ yield data
52
+ time.sleep(0.5)
53
+
54
+ # Example usage:
55
+ generator = FakeStreamingDataGenerator()
56
+ stream = generator.stream_data()
57
+
58
+ logger = setup_logger_behaviour(__name__) # call your setup function here
59
+ while True:
60
+ chunk = next(stream)
61
+ # Replacing print with logger
62
+ logger.info(chunk) # Best practice now
@@ -9,8 +9,9 @@ from bulk_chain.infer import iter_content_cached
9
9
  class TestAPI(unittest.TestCase):
10
10
 
11
11
  llm = dynamic_init(class_dir=join(CWD, ".."),
12
- class_filepath="ext/replicate.py",
13
- class_name="Replicate")(api_token="<API-KEY>")
12
+ class_filepath="providers/replicate_104.py",
13
+ class_name="Replicate")(api_token="<API-KEY>",
14
+ model_name="deepseek-ai/deepseek-r1")
14
15
 
15
16
  def it_data(self, n):
16
17
  for i in range(n):
@@ -19,7 +20,7 @@ class TestAPI(unittest.TestCase):
19
20
  def test_iter_cached(self):
20
21
  data_it = iter_content_cached(input_dicts_it=self.it_data(20),
21
22
  llm=self.llm,
22
- schema="../ext/schema/default.json",
23
+ schema="../schema/default.json",
23
24
  # Cache-related extra parameters.
24
25
  cache_target="out.sqlite:content",
25
26
  id_column_name="ind")
@@ -32,7 +33,7 @@ class TestAPI(unittest.TestCase):
32
33
  llm=self.llm,
33
34
  batch_size=1,
34
35
  return_batch=True,
35
- schema="../ext/schema/default.json")
36
+ schema="../schema/default.json")
36
37
 
37
38
  for data in data_it:
38
39
  print(data)
@@ -0,0 +1,29 @@
1
+ import sys
2
+ import unittest
3
+
4
+ from bulk_chain.core.service_args import CmdArgsService
5
+
6
+
7
+ class TestCmdArgs(unittest.TestCase):
8
+
9
+ def test(self):
10
+
11
+ # Csv-related.
12
+ csv_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%csv", end_prefix="%%")
13
+ print(csv_args)
14
+ csv_args = CmdArgsService.args_to_dict(csv_args)
15
+ print("csv\t", csv_args)
16
+
17
+ # Model-related.
18
+ m_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%m", end_prefix="%%")
19
+ m_args = CmdArgsService.args_to_dict(m_args)
20
+ print("mod\t", m_args)
21
+
22
+ # native.
23
+ n_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
24
+ n_args = CmdArgsService.args_to_dict(n_args)
25
+ print("nat\t", n_args)
26
+
27
+
28
+ if __name__ == '__main__':
29
+ unittest.main()
@@ -0,0 +1,31 @@
1
+ import unittest
2
+ from os.path import join
3
+
4
+ from tqdm import tqdm
5
+
6
+ from bulk_chain.api import CWD, iter_content
7
+ from bulk_chain.core.utils import dynamic_init
8
+ from utils import iter_test_jsonl_samples
9
+
10
+
11
+ class TestProviderBatching(unittest.TestCase):
12
+
13
+ llm = dynamic_init(class_dir=join(CWD, ".."),
14
+ class_filepath="providers/transformers_flan_t5.py",
15
+ class_name="FlanT5")(model_name="nicolay-r/flan-t5-tsa-thor-base",
16
+ max_new_tokens=128)
17
+
18
+ def test_iter(self):
19
+ input_dicts_it = iter_test_jsonl_samples()
20
+ data_it = iter_content(input_dicts_it=input_dicts_it,
21
+ llm=self.llm,
22
+ batch_size=20,
23
+ return_batch=False,
24
+ schema="schema/default.json")
25
+
26
+ for item in tqdm(data_it):
27
+ print(item)
28
+
29
+
30
+ if __name__ == '__main__':
31
+ unittest.main()
@@ -1,20 +0,0 @@
1
- import sys
2
-
3
- from bulk_chain.core.service_args import CmdArgsService
4
-
5
-
6
- # Csv-related.
7
- csv_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%csv", end_prefix="%%")
8
- print(csv_args)
9
- csv_args = CmdArgsService.args_to_dict(csv_args)
10
- print("csv\t", csv_args)
11
-
12
- # Model-related.
13
- m_args = CmdArgsService.find_grouped_args(sys.argv, starts_with="%%m", end_prefix="%%")
14
- m_args = CmdArgsService.args_to_dict(m_args)
15
- print("mod\t", m_args)
16
-
17
- # native.
18
- n_args = CmdArgsService.extract_native_args(sys.argv, end_prefix="%%")
19
- n_args = CmdArgsService.args_to_dict(n_args)
20
- print("nat\t", n_args)
File without changes
File without changes