graphbook 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphbook-0.4.0/LICENSE +21 -0
- graphbook-0.4.0/PKG-INFO +99 -0
- graphbook-0.4.0/README.md +70 -0
- graphbook-0.4.0/graphbook/__init__.py +1 -0
- graphbook-0.4.0/graphbook/custom_nodes.py +122 -0
- graphbook-0.4.0/graphbook/dataloading.py +250 -0
- graphbook-0.4.0/graphbook/exports.py +98 -0
- graphbook-0.4.0/graphbook/media.py +70 -0
- graphbook-0.4.0/graphbook/note.py +40 -0
- graphbook-0.4.0/graphbook/processing.py +245 -0
- graphbook-0.4.0/graphbook/resources/__init__.py +1 -0
- graphbook-0.4.0/graphbook/resources/base.py +39 -0
- graphbook-0.4.0/graphbook/server.py +479 -0
- graphbook-0.4.0/graphbook/state.py +402 -0
- graphbook-0.4.0/graphbook/steps/__init__.py +2 -0
- graphbook-0.4.0/graphbook/steps/arithmetic.py +59 -0
- graphbook-0.4.0/graphbook/steps/base.py +507 -0
- graphbook-0.4.0/graphbook/steps/io.py +48 -0
- graphbook-0.4.0/graphbook/utils.py +117 -0
- graphbook-0.4.0/graphbook/viewer.py +382 -0
- graphbook-0.4.0/graphbook/web/assets/index-DAjp4Dbh.js +1016 -0
- graphbook-0.4.0/graphbook/web/assets/index-UXoXgXuD.css +1 -0
- graphbook-0.4.0/graphbook/web/graphbook.svg +1 -0
- graphbook-0.4.0/graphbook/web/index.html +14 -0
- graphbook-0.4.0/pyproject.toml +55 -0
graphbook-0.4.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Graphbook AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
graphbook-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: graphbook
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: An extensible ML workflow framework built for data scientists and ML engineers.
|
|
5
|
+
Home-page: https://graphbook.ai
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: ml,workflow,framework,pytorch,data science,machine learning,ai
|
|
8
|
+
Author: Richard Franklin
|
|
9
|
+
Author-email: rsamf@graphbook.ai
|
|
10
|
+
Requires-Python: >=3.11,<4.0
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Dist: aiohttp (>=3.9.4,<4.0.0)
|
|
17
|
+
Requires-Dist: dill (>=0.3.8,<0.4.0)
|
|
18
|
+
Requires-Dist: gputil (>=1.4.0,<2.0.0)
|
|
19
|
+
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
20
|
+
Requires-Dist: psutil (>=6.0.0,<7.0.0)
|
|
21
|
+
Requires-Dist: python-magic (>=0.4.27,<0.5.0)
|
|
22
|
+
Requires-Dist: torch (>=2.3.1,<3.0.0)
|
|
23
|
+
Requires-Dist: torchvision (>=0.18.1,<0.19.0)
|
|
24
|
+
Requires-Dist: watchdog (>=4.0.0,<5.0.0)
|
|
25
|
+
Project-URL: Documentation, https://docs.graphbook.ai
|
|
26
|
+
Project-URL: Repository, https://github.com/graphbookai/graphbook
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<a href="https://graphbook.ai">
|
|
31
|
+
<img src="docs/_static/graphbook.png" alt="Logo" width=256>
|
|
32
|
+
</a>
|
|
33
|
+
|
|
34
|
+
<h1 align="center">Graphbook</h1>
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
The ML workflow framework
|
|
38
|
+
<br>
|
|
39
|
+
<a href="https://github.com/graphbookai/graphbook/issues/new?template=bug.md">Report bug</a>
|
|
40
|
+
·
|
|
41
|
+
<a href="https://github.com/graphbookai/graphbook/issues/new?template=feature.md&labels=feature">Request feature</a>
|
|
42
|
+
</p>
|
|
43
|
+
|
|
44
|
+
<p align="center">
|
|
45
|
+
<a href="#overview">Overview</a> •
|
|
46
|
+
<a href="#current-features">Current Features</a> •
|
|
47
|
+
<a href="#getting-started">Getting Started</a> •
|
|
48
|
+
<a href="#collaboration">Collaboration</a>
|
|
49
|
+
</p>
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
## Overview
|
|
53
|
+
Graphbook is a framework for building efficient, visual DAG-structured ML workflows composed of nodes written in Python. Graphbook provides common ML processing features such as multiprocessing IO and automatic batching, and it features a web-based UI to assemble, monitor, and execute data processing workflows. It can be used to prepare training data for custom ML models, experiment with custom trained or off-the-shelf models, and to build ML-based ETL applications. Custom nodes can be built in Python, and Graphbook will behave like a framework and call lifecycle methods on those nodes.
|
|
54
|
+
|
|
55
|
+
## Current Features
|
|
56
|
+
- Graph-based visual editor to experiment and create complex ML workflows
|
|
57
|
+
- Caches outputs and only re-executes parts of the workflow that changes between executions
|
|
58
|
+
- UI monitoring components for logs and outputs per node
|
|
59
|
+
- Custom buildable nodes with Python
|
|
60
|
+
- Automatic batching for Pytorch tensors
|
|
61
|
+
- Multiprocessing I/O to and from disk and network
|
|
62
|
+
- Customizable multiprocessing functions
|
|
63
|
+
- Ability to execute entire graphs, or individual subgraphs/nodes
|
|
64
|
+
- Ability to execute singular batches of data
|
|
65
|
+
- Ability to pause graph execution
|
|
66
|
+
- Basic nodes for filtering, loading, and saving outputs
|
|
67
|
+
- Node grouping and subflows
|
|
68
|
+
- Autosaving and shareable serialized workflow files
|
|
69
|
+
- Registers node code changes without needing a restart
|
|
70
|
+
- Monitorable CPU and GPU resource usage
|
|
71
|
+
|
|
72
|
+
## Getting Started
|
|
73
|
+
### Install from PyPI
|
|
74
|
+
1. `pip install graphbook`
|
|
75
|
+
1. `graphbook`
|
|
76
|
+
1. Visit http://localhost:8007
|
|
77
|
+
|
|
78
|
+
### Install with Docker
|
|
79
|
+
1. Pull and run the downloaded image
|
|
80
|
+
```bash
|
|
81
|
+
docker run --rm -p 8005:8005 -p 8006:8006 -p 8007:8007 -v $PWD/workflows:/app/workflows rsamf/graphbook:latest
|
|
82
|
+
```
|
|
83
|
+
1. Visit http://localhost:8007
|
|
84
|
+
|
|
85
|
+
Visit the [docs](https://docs.graphbook.ai) to learn more on how to create custom nodes and workflows with Graphbook.
|
|
86
|
+
|
|
87
|
+
## Collaboration
|
|
88
|
+
This is a guide on how to get started developing Graphbook. If you are simply using Graphbook, view the [Getting Started](#getting-started) section.
|
|
89
|
+
|
|
90
|
+
### Run Graphbook in Development Mode
|
|
91
|
+
You can use any other virtual environment solution, but `poetry` is used in the steps below.
|
|
92
|
+
1. Clone the repo and `cd graphbook`
|
|
93
|
+
1. `poetry install --with dev`
|
|
94
|
+
1. `poetry shell`
|
|
95
|
+
1. `python graphbook/server.py`
|
|
96
|
+
1. `cd web`
|
|
97
|
+
1. `npm install`
|
|
98
|
+
1. `npm run dev`
|
|
99
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<a href="https://graphbook.ai">
|
|
3
|
+
<img src="docs/_static/graphbook.png" alt="Logo" width=256>
|
|
4
|
+
</a>
|
|
5
|
+
|
|
6
|
+
<h1 align="center">Graphbook</h1>
|
|
7
|
+
|
|
8
|
+
<p align="center">
|
|
9
|
+
The ML workflow framework
|
|
10
|
+
<br>
|
|
11
|
+
<a href="https://github.com/graphbookai/graphbook/issues/new?template=bug.md">Report bug</a>
|
|
12
|
+
·
|
|
13
|
+
<a href="https://github.com/graphbookai/graphbook/issues/new?template=feature.md&labels=feature">Request feature</a>
|
|
14
|
+
</p>
|
|
15
|
+
|
|
16
|
+
<p align="center">
|
|
17
|
+
<a href="#overview">Overview</a> •
|
|
18
|
+
<a href="#current-features">Current Features</a> •
|
|
19
|
+
<a href="#getting-started">Getting Started</a> •
|
|
20
|
+
<a href="#collaboration">Collaboration</a>
|
|
21
|
+
</p>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
## Overview
|
|
25
|
+
Graphbook is a framework for building efficient, visual DAG-structured ML workflows composed of nodes written in Python. Graphbook provides common ML processing features such as multiprocessing IO and automatic batching, and it features a web-based UI to assemble, monitor, and execute data processing workflows. It can be used to prepare training data for custom ML models, experiment with custom trained or off-the-shelf models, and to build ML-based ETL applications. Custom nodes can be built in Python, and Graphbook will behave like a framework and call lifecycle methods on those nodes.
|
|
26
|
+
|
|
27
|
+
## Current Features
|
|
28
|
+
- Graph-based visual editor to experiment and create complex ML workflows
|
|
29
|
+
- Caches outputs and only re-executes parts of the workflow that changes between executions
|
|
30
|
+
- UI monitoring components for logs and outputs per node
|
|
31
|
+
- Custom buildable nodes with Python
|
|
32
|
+
- Automatic batching for Pytorch tensors
|
|
33
|
+
- Multiprocessing I/O to and from disk and network
|
|
34
|
+
- Customizable multiprocessing functions
|
|
35
|
+
- Ability to execute entire graphs, or individual subgraphs/nodes
|
|
36
|
+
- Ability to execute singular batches of data
|
|
37
|
+
- Ability to pause graph execution
|
|
38
|
+
- Basic nodes for filtering, loading, and saving outputs
|
|
39
|
+
- Node grouping and subflows
|
|
40
|
+
- Autosaving and shareable serialized workflow files
|
|
41
|
+
- Registers node code changes without needing a restart
|
|
42
|
+
- Monitorable CPU and GPU resource usage
|
|
43
|
+
|
|
44
|
+
## Getting Started
|
|
45
|
+
### Install from PyPI
|
|
46
|
+
1. `pip install graphbook`
|
|
47
|
+
1. `graphbook`
|
|
48
|
+
1. Visit http://localhost:8007
|
|
49
|
+
|
|
50
|
+
### Install with Docker
|
|
51
|
+
1. Pull and run the downloaded image
|
|
52
|
+
```bash
|
|
53
|
+
docker run --rm -p 8005:8005 -p 8006:8006 -p 8007:8007 -v $PWD/workflows:/app/workflows rsamf/graphbook:latest
|
|
54
|
+
```
|
|
55
|
+
1. Visit http://localhost:8007
|
|
56
|
+
|
|
57
|
+
Visit the [docs](https://docs.graphbook.ai) to learn more on how to create custom nodes and workflows with Graphbook.
|
|
58
|
+
|
|
59
|
+
## Collaboration
|
|
60
|
+
This is a guide on how to get started developing Graphbook. If you are simply using Graphbook, view the [Getting Started](#getting-started) section.
|
|
61
|
+
|
|
62
|
+
### Run Graphbook in Development Mode
|
|
63
|
+
You can use any other virtual environment solution, but `poetry` is used in the steps below.
|
|
64
|
+
1. Clone the repo and `cd graphbook`
|
|
65
|
+
1. `poetry install --with dev`
|
|
66
|
+
1. `poetry shell`
|
|
67
|
+
1. `python graphbook/server.py`
|
|
68
|
+
1. `cd web`
|
|
69
|
+
1. `npm install`
|
|
70
|
+
1. `npm run dev`
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .note import Note
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from watchdog.events import FileSystemEvent, FileSystemEventHandler
|
|
3
|
+
from watchdog.observers import Observer
|
|
4
|
+
import importlib
|
|
5
|
+
import importlib.util
|
|
6
|
+
import hashlib
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import os.path as osp
|
|
10
|
+
import inspect
|
|
11
|
+
from graphbook.steps import Step, BatchStep, SourceStep, AsyncStep, Split, SplitNotesByItems, SplitItemField
|
|
12
|
+
from graphbook.resources import Resource, FunctionResource
|
|
13
|
+
|
|
14
|
+
# Classes that ship with graphbook itself. The custom-node importer compares
# discovered classes against these lists so that only user-defined subclasses
# are registered as custom nodes.
BUILT_IN_STEPS = [Step, BatchStep, SourceStep, AsyncStep, Split, SplitNotesByItems, SplitItemField]
BUILT_IN_RESOURCES = [Resource, FunctionResource]
|
|
16
|
+
|
|
17
|
+
class CustomModuleEventHandler(FileSystemEventHandler):
    """Watchdog handler that imports/reloads Python modules under a watched root.

    Keeps an MD5 hash of each file's last-seen contents so that no-op saves are
    ignored. On a genuine change, the module is imported (first sighting) or
    reloaded, and the async ``handler(relative_filename, module)`` callback is
    invoked so callers can re-scan the module for exported nodes.
    """

    def __init__(self, root_path, handler):
        """
        Args:
            root_path: Directory to track; stored as an absolute path.
            handler: Async callable ``(relative_filename, module)`` invoked
                after each successful import/reload.
        """
        super().__init__()
        self.root_path = osp.abspath(root_path)
        self.handler = handler
        # filename -> md5 hexdigest of the last-seen contents
        self.ha = {}

    def on_created(self, event):
        if event.is_directory:
            return
        self.handle_new_file_sync(event.src_path)

    def on_modified(self, event):
        if event.is_directory:
            return
        self.handle_new_file_sync(event.src_path)

    def on_deleted(self, event):
        # Deletions are intentionally ignored: there is no safe way to
        # "unimport" a module that may still be referenced elsewhere.
        if event.is_directory:
            return

    def on_moved(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        self.handle_new_file_sync(event.dest_path)

    async def handle_new_file(self, filename: str):
        """Hash, import/reload, and report a single ``.py`` file under root_path."""
        filename = osp.abspath(filename)
        assert filename.startswith(
            self.root_path
        ), f"Received extraneous file (unknown) during tracking of {self.root_path}"
        if not filename.endswith(".py"):
            return

        with open(filename, "r") as f:
            contents = f.read()

        hash_code = hashlib.md5(contents.encode()).hexdigest()
        og_hash_code = self.ha.get(filename, None)
        if hash_code == og_hash_code:
            # Contents unchanged (e.g. editor re-save); skip the reload.
            return

        self.ha[filename] = hash_code
        # Derive the dotted module name from the path relative to root_path.
        # Using os.sep and splitext (instead of splitting on a literal "/"
        # and index(".py")) keeps this correct on Windows and on filenames
        # that happen to contain ".py" in the middle.
        filename = filename[len(self.root_path) + 1 :]
        module_name = osp.splitext(filename)[0].replace(os.sep, ".")

        try:
            if og_hash_code is None:
                importlib.import_module(module_name)
                print("Loaded", module_name)
            else:
                module = importlib.import_module(module_name)
                importlib.reload(module)
                print("Reloaded", module_name)
        except Exception as e:
            print(f"Error loading {module_name}: {e}")
            return

        module = sys.modules[module_name]
        await self.handler(filename, module)

    def handle_new_file_sync(self, filename: str):
        # Watchdog dispatches events on its own observer thread, so it is
        # safe to run a fresh event loop per event here.
        asyncio.run(self.handle_new_file(filename))

    async def init_custom_nodes(self):
        """Startup scan: import every file currently under root_path."""
        for root, dirs, files in os.walk(self.root_path):
            for file in files:
                await self.handle_new_file(osp.join(root, file))

    def init_custom_nodes_sync(self):
        asyncio.run(self.init_custom_nodes())
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class CustomNodeImporter:
    """Watches a directory of user code and exports discovered node classes.

    Wires a ``CustomModuleEventHandler`` to a watchdog ``Observer`` and, for
    every imported/reloaded module, forwards each non-built-in ``Step`` or
    ``Resource`` subclass to the corresponding handler callback.
    """

    def __init__(self, path, step_handler, resource_handler):
        self.websocket = None
        self.path = path
        self.step_handler = step_handler
        self.resource_handler = resource_handler
        # Make the watched directory importable as top-level modules.
        sys.path.append(path)
        self.observer = Observer()
        self.event_handler = CustomModuleEventHandler(path, self.on_module)
        # Eagerly import everything already present before watching begins.
        self.event_handler.init_custom_nodes_sync()

    def set_websocket(self, websocket):
        self.websocket = websocket

    async def on_module(self, filename, mod):
        # Scan only the classes defined/exposed by the module.
        for name, obj in inspect.getmembers(mod, inspect.isclass):
            if issubclass(obj, Step) and obj not in BUILT_IN_STEPS:
                await self.step_handler(filename, name, obj)
            if issubclass(obj, Resource) and obj not in BUILT_IN_RESOURCES:
                await self.resource_handler(filename, name, obj)

        # Nudge any connected UI so it refreshes its node catalog.
        if self.websocket is not None and not self.websocket.closed:
            await self.websocket.send_json({"event": "node_updated"})

    def start_observer(self):
        self.observer.schedule(self.event_handler, self.path, recursive=True)
        self.observer.start()

    def stop_observer(self):
        self.observer.stop()
        self.observer.join()
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import queue
|
|
3
|
+
import torch.multiprocessing as mp
|
|
4
|
+
|
|
5
|
+
# Seconds to wait for a worker process to join during shutdown before it is
# forcibly terminated.
MP_WORKER_TIMEOUT = 5.0
# Upper bound on buffered results: applied per worker result queue and to the
# total number of items buffered across all consumer queues.
MAX_RESULT_QUEUE_SIZE = 32
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def do_load(work_queue: mp.Queue, result_queue: mp.Queue):
    """Pull one load task off ``work_queue``, run its ``load_fn``, and enqueue
    the outcome on ``result_queue``.

    Skips entirely when the result queue is full or the work queue is empty.
    A failed ``load_fn`` is reported as a ``(None, record_id)`` result so the
    consumer can account for the record.
    """
    if result_queue.full():
        return

    try:
        item, index, record_id, load_fn, consumer_id = work_queue.get(False)
    except queue.Empty:
        return

    try:
        loaded = load_fn(item)
    except Exception as e:
        print(
            f"Worker Error: Could not process input {item}. The following exception was raised: {e}. Check your load_fn."
        )
        payload = ((None, record_id), consumer_id)
    else:
        payload = (((loaded, index), record_id), consumer_id)

    result_queue.put(payload, block=False)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def do_dump(
    work_queue: mp.Queue, result_queue: mp.Queue, output_dir: str, uid: int
) -> bool:
    """Pull one dump task off ``work_queue``, run its ``dump_fn``, and enqueue
    the outcome on ``result_queue``.

    Returns True when a task was taken from the work queue (even if the
    ``dump_fn`` failed), False when nothing was consumed — callers use this
    to advance their uid counter.
    """
    if result_queue.full():
        return False

    try:
        data, item_key, record_id, dump_fn, consumer_id = work_queue.get(False)
    except queue.Empty:
        return False

    try:
        payload = (((item_key, dump_fn(data, output_dir, uid)), record_id), consumer_id)
    except Exception as e:
        print(f"Could not dump input {data}. The following exception was raised: {e}")
        payload = ((None, record_id), consumer_id)

    result_queue.put(payload, block=False)
    return True
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def worker_loop(
    rank: int,
    num_processes: int,
    load_queue: mp.Queue,
    dump_queue: mp.Queue,
    load_result_queue: mp.Queue,
    dump_result_queue: mp.Queue,
    dump_dir: str,
    close_event: mp.Event,
):
    """Worker-process main loop: alternate load and dump work until closed.

    Each worker hands out dump uids from the arithmetic progression
    ``rank, rank + num_processes, rank + 2*num_processes, ...`` so that uids
    are unique across the whole pool without coordination.
    """
    uid = rank
    try:
        while not close_event.is_set():
            do_load(load_queue, load_result_queue)
            if do_dump(dump_queue, dump_result_queue, dump_dir, uid):
                # Only consume a uid when a dump task was actually taken.
                uid += num_processes
    except KeyboardInterrupt:
        # Parent handles shutdown; just exit quietly.
        pass
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class Dataloader:
    """Pool of spawn-context worker processes for off-main-process load/dump work.

    Work items are distributed round-robin across per-worker queues; finished
    results flow back through per-worker result queues and are routed by
    ``_handle_queues`` into in-process queues keyed by consumer id. The total
    number of routed-but-unconsumed results is capped at
    ``MAX_RESULT_QUEUE_SIZE`` to bound memory.
    """

    def __init__(self, dump_dir: str, num_workers: int = 1):
        """
        Args:
            dump_dir: Directory passed to each worker's dump functions.
            num_workers: Number of worker processes to spawn.
        """
        self.dump_dir = dump_dir
        self.num_workers = num_workers
        # "spawn" avoids forking a parent that may already hold CUDA/torch state.
        self.context = mp.get_context("spawn")
        # consumer_id -> in-process queue of routed results
        self.consumer_load_queues = {}
        self.consumer_dump_queues = {}
        # Count of results currently buffered across all consumer queues.
        self.total_consumer_size = 0
        self._start_workers()

    def _start_workers(self):
        """Create queues and spawn ``num_workers`` daemon worker processes."""
        self._workers: List[mp.Process] = []
        self._worker_queue_cycle = 0
        self._load_queues: List[mp.Queue] = []
        self._dump_queues: List[mp.Queue] = []
        self._load_result_queues: List[mp.Queue] = []
        self._dump_result_queues: List[mp.Queue] = []
        self._close_event: mp.Event = self.context.Event()
        for i in range(self.num_workers):
            load_queue = self.context.Queue()
            dump_queue = self.context.Queue()
            load_result_queue = self.context.Queue(maxsize=MAX_RESULT_QUEUE_SIZE)
            dump_result_queue = self.context.Queue(maxsize=MAX_RESULT_QUEUE_SIZE)
            # Don't let pending queue data block interpreter exit.
            load_queue.cancel_join_thread()
            dump_queue.cancel_join_thread()
            w = self.context.Process(
                target=worker_loop,
                args=(
                    i,
                    self.num_workers,
                    load_queue,
                    dump_queue,
                    load_result_queue,
                    dump_result_queue,
                    self.dump_dir,
                    self._close_event,
                ),
            )
            w.daemon = True
            w.start()
            self._load_queues.append(load_queue)
            self._dump_queues.append(dump_queue)
            self._load_result_queues.append(load_result_queue)
            self._dump_result_queues.append(dump_result_queue)
            self._workers.append(w)

    def setup(self, consumer_ids: List[int]):
        """Ensure per-consumer queues exist for ``consumer_ids`` and drop queues
        (adjusting the buffered-result count) for consumers no longer present."""
        for c in consumer_ids:
            if c not in self.consumer_load_queues:
                self.consumer_load_queues[c] = queue.Queue()
            if c not in self.consumer_dump_queues:
                self.consumer_dump_queues[c] = queue.Queue()
        unused_ids = set(self.consumer_load_queues.keys()) - set(consumer_ids)
        for c in unused_ids:
            self.total_consumer_size -= self.consumer_load_queues[c].qsize()
            del self.consumer_load_queues[c]
        unused_ids = set(self.consumer_dump_queues.keys()) - set(consumer_ids)
        for c in unused_ids:
            self.total_consumer_size -= self.consumer_dump_queues[c].qsize()
            del self.consumer_dump_queues[c]

    def shutdown(self):
        """Signal workers to stop, close queues, and join (terminating stragglers)."""
        if len(self._workers) == 0:
            return
        try:
            self._close_event.set()
            for q in self._load_queues:
                q.cancel_join_thread()
                q.close()
            for q in self._dump_queues:
                q.cancel_join_thread()
                q.close()
            for w in self._workers:
                w.join(timeout=MP_WORKER_TIMEOUT)
        finally:
            for w in self._workers:
                if w.is_alive():
                    w.terminate()

    def _handle_queues(self):
        """Drain worker result queues and route entries to consumer queues,
        respecting the global MAX_RESULT_QUEUE_SIZE cap. Results for consumers
        that no longer exist are dropped."""
        for queues, consumers in zip(
            [self._load_result_queues, self._dump_result_queues],
            [self.consumer_load_queues, self.consumer_dump_queues],
        ):
            for q in queues:
                while (
                    not q.empty() and self.total_consumer_size < MAX_RESULT_QUEUE_SIZE
                ):
                    result, consumer_id = q.get(False)
                    if consumer_id not in consumers:
                        continue
                    consumers[consumer_id].put(result, block=False)
                    self.total_consumer_size += 1

    def get_all_sizes(self):
        """Return queue depths for monitoring/diagnostics."""
        return {
            "load": [q.qsize() for q in self._load_queues],
            "dump": [q.qsize() for q in self._dump_queues],
            "load_result": [q.qsize() for q in self._load_result_queues],
            "dump_result": [q.qsize() for q in self._dump_result_queues],
            "total_consumer_size": self.total_consumer_size,
        }

    def clear(self):
        """Best-effort drain of all worker and consumer queues and reset the
        buffered-result counter."""
        def clear_queue(q):
            while not q.empty():
                try:
                    q.get(False)
                except queue.Empty:
                    print("Emptying an empty queue. Is the graph still executing?")
                    break

        for q in self._load_queues:
            clear_queue(q)
        for q in self._dump_queues:
            clear_queue(q)
        for q in self._load_result_queues:
            clear_queue(q)
        for q in self._dump_result_queues:
            clear_queue(q)
        # BUGFIX: these are dicts keyed by consumer id — iterate the queue
        # values, not the keys (iterating keys passed ints to clear_queue).
        for q in self.consumer_load_queues.values():
            clear_queue(q)
        for q in self.consumer_dump_queues.values():
            clear_queue(q)
        self.total_consumer_size = 0

    def put_load(
        self, items: list, record_id: int, load_fn: callable, consumer_id: int
    ):
        """Enqueue one load task per item, round-robining across workers per call."""
        for i, item in enumerate(items):
            self._load_queues[self._worker_queue_cycle].put(
                (item, i, record_id, load_fn, consumer_id), block=False
            )
        self._worker_queue_cycle = (self._worker_queue_cycle + 1) % self.num_workers

    def get_load(self, consumer_id):
        """Return ((tensor, index), record_id) for the consumer, (None, record_id)
        if the worker's load_fn failed, or None when nothing is available."""
        self._handle_queues()
        if consumer_id not in self.consumer_load_queues:
            return None
        try:
            result, record_id = self.consumer_load_queues[consumer_id].get(False)
            self.total_consumer_size -= 1
            if result is None:
                return None, record_id
            t, index = result
            # Clone so the returned tensor is decoupled from the shared-memory
            # storage handed over by the worker process.
            t_clone = t.clone()
            del t
            return (t_clone, index), record_id
        except queue.Empty:
            return None

    def put_dump(
        self,
        data: any,
        item_key: str,
        record_id: int,
        dump_fn: callable,
        consumer_id: int,
    ):
        """Enqueue one dump task, round-robining across workers."""
        self._dump_queues[self._worker_queue_cycle].put(
            (data, item_key, record_id, dump_fn, consumer_id), block=False
        )
        self._worker_queue_cycle = (self._worker_queue_cycle + 1) % self.num_workers

    def get_dump(self, consumer_id):
        """Return the next routed dump entry for the consumer, or None.

        NOTE(review): the returned value is the full routed entry
        (result, record_id) as produced by do_dump, despite the local name —
        kept as-is for caller compatibility.
        """
        self._handle_queues()
        if consumer_id not in self.consumer_dump_queues:
            return None
        try:
            record_id = self.consumer_dump_queues[consumer_id].get(False)
            self.total_consumer_size -= 1
            return record_id
        except queue.Empty:
            return None
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import graphbook.steps as steps
|
|
2
|
+
import graphbook.resources.base as rbase
|
|
3
|
+
import graphbook.custom_nodes as custom_nodes
|
|
4
|
+
from aiohttp import web
|
|
5
|
+
|
|
6
|
+
# Built-in step nodes exported to the UI by default; NodeHub seeds its
# exported-steps registry from this mapping.
default_exported_steps = {
    "Split": steps.Split,
    "SplitNotesByItems": steps.SplitNotesByItems,
    "SplitItemField": steps.SplitItemField,
    "DumpJSONL": steps.DumpJSONL,
    "LoadJSONL": steps.LoadJSONL,
}

# Built-in resource nodes exported to the UI by default.
default_exported_resources = {
    "Text": rbase.Resource,
    "Number": rbase.NumberResource,
    "Function": rbase.FunctionResource,
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class NodeHub:
    """Registry of all exported step and resource node classes.

    Starts from the built-in defaults and grows as the custom-node importer
    discovers user-defined classes; also serializes the registry into the
    category tree consumed by the web UI.
    """

    def __init__(self, path):
        # BUGFIX: copy the module-level defaults instead of aliasing them —
        # aliasing made every NodeHub instance mutate the shared default
        # dicts when custom nodes were registered.
        self.exported_steps = dict(default_exported_steps)
        self.exported_resources = dict(default_exported_resources)
        self.custom_node_importer = custom_nodes.CustomNodeImporter(
            path, self.handle_step, self.handle_resource
        )

    def start(self):
        self.custom_node_importer.start_observer()

    def stop(self):
        self.custom_node_importer.stop_observer()

    async def handle_step(self, filename, name, step):
        """Register (or re-register) a discovered custom Step class."""
        if name in self.exported_steps:
            print(f"Reloading custom step node {name} from (unknown)")
        else:
            print(f"Loading custom step node {name} from (unknown)")
        self.exported_steps[name] = step

    async def handle_resource(self, filename, name, resource):
        """Register (or re-register) a discovered custom Resource class."""
        if name in self.exported_resources:
            print(f"Reloading custom resource node {name} from (unknown)")
        else:
            print(f"Loading custom resource node {name} from (unknown)")
        self.exported_resources[name] = resource

    def get_steps(self):
        return self.exported_steps

    def get_resources(self):
        return self.exported_resources

    def get_all(self):
        return {"steps": self.get_steps(), "resources": self.get_resources()}

    def get_exported_nodes(self):
        """Serialize the registry into UI-friendly category trees.

        Each node's ``Category`` ("a/b/c") becomes a nested directory
        structure of ``{"children": {...}}`` entries with the node dict as a
        leaf.
        """

        def create_dir_structure(nodes):
            node_tree = {}
            for node_name in nodes:
                node = nodes[node_name]
                category_tree = node["category"].split("/")
                curr_category = node_tree
                for category in category_tree:
                    if curr_category.get(category) is None:
                        curr_category[category] = {"children": {}}
                    curr_category = curr_category[category]["children"]
                curr_category[node_name] = node
            return node_tree

        # Locals renamed (was "steps"/"resources") so they no longer shadow
        # the module-level `graphbook.steps` import.
        step_nodes = {
            k: {
                "name": k,
                "parameters": v.Parameters,
                "inputs": ["in"] if v.RequiresInput else [],
                "outputs": v.Outputs,
                "category": v.Category,
            }
            for k, v in self.get_steps().items()
        }
        resource_nodes = {
            k: {
                "name": k,
                "parameters": v.Parameters,
                "category": v.Category,
            }
            for k, v in self.get_resources().items()
        }

        return {
            "steps": create_dir_structure(step_nodes),
            "resources": create_dir_structure(resource_nodes),
        }

    # Annotation stringified so the class can be defined before aiohttp's
    # `web` is imported; behavior is unchanged.
    def set_websocket(self, websocket: "web.WebSocketResponse"):
        self.custom_node_importer.set_websocket(websocket)
|