arbor-ai 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -9,10 +9,11 @@ class FileResponse(BaseModel):
9
9
  purpose: str
10
10
 
11
11
  class FineTuneRequest(BaseModel):
12
- model_name: str
12
+ model: str
13
13
  training_file: str # id of uploaded jsonl file
14
14
 
15
15
  class JobStatusResponse(BaseModel):
16
- job_id: str
16
+ id: str
17
17
  status: str
18
- details: str = ""
18
+ details: str = ""
19
+ fine_tuned_model: str | None = None
@@ -1,13 +1,23 @@
1
- from fastapi import APIRouter, UploadFile, File, Depends
1
+ from fastapi import APIRouter, UploadFile, File, Depends, HTTPException
2
2
  from arbor.server.services.file_manager import FileManager
3
3
  from arbor.server.api.models.schemas import FileResponse
4
4
  from arbor.server.services.dependencies import get_file_manager
5
+ from arbor.server.services.file_manager import FileValidationError
5
6
 
6
7
  router = APIRouter()
7
8
 
8
9
  @router.post("", response_model=FileResponse)
9
- def upload_file(
10
+ async def upload_file(
10
11
  file: UploadFile = File(...),
11
12
  file_manager: FileManager = Depends(get_file_manager)
12
13
  ):
13
- return file_manager.save_uploaded_file(file)
14
+ if not file.filename.endswith('.jsonl'):
15
+ raise HTTPException(status_code=400, detail="Only .jsonl files are allowed")
16
+
17
+ try:
18
+ content = await file.read()
19
+ file_manager.validate_file_format(content)
20
+ await file.seek(0) # Reset file pointer to beginning
21
+ return file_manager.save_uploaded_file(file)
22
+ except FileValidationError as e:
23
+ raise HTTPException(status_code=400, detail=f"Invalid file format: {str(e)}")
@@ -10,5 +10,5 @@ def get_job_status(
10
10
  job_id: str,
11
11
  job_manager: JobManager = Depends(get_job_manager)
12
12
  ):
13
- status = job_manager.get_job_status(job_id)
14
- return JobStatusResponse(job_id=job_id, status=status.value)
13
+ job = job_manager.get_job(job_id)
14
+ return JobStatusResponse(id=job_id, status=job.status.value, fine_tuned_model=job.fine_tuned_model)
@@ -13,4 +13,4 @@ def fine_tune(request: FineTuneRequest, background_tasks: BackgroundTasks, train
13
13
  job = job_manager.create_job()
14
14
  background_tasks.add_task(training_manager.fine_tune, request, job, file_manager)
15
15
  job.status = JobStatus.QUEUED
16
- return JobStatusResponse(job_id=job.id, status=job.status.value)
16
+ return JobStatusResponse(id=job.id, status=job.status.value)
@@ -7,6 +7,10 @@ import uuid
7
7
  from fastapi import UploadFile
8
8
  from arbor.server.api.models.schemas import FileResponse
9
9
 
10
+ class FileValidationError(Exception):
11
+ """Custom exception for file validation errors"""
12
+ pass
13
+
10
14
  class FileManager:
11
15
  def __init__(self):
12
16
  self.uploads_dir = Path("uploads")
@@ -80,4 +84,45 @@ class FileManager:
80
84
  return FileResponse(**file_data)
81
85
 
82
86
  def get_file(self, file_id: str):
83
- return self.files[file_id]
87
+ return self.files[file_id]
88
+
89
+ def validate_file_format(self, file_content: bytes) -> None:
90
+ """
91
+ Validates that the file content is properly formatted JSONL with expected structure.
92
+ Raises FileValidationError if validation fails.
93
+ """
94
+ if not file_content:
95
+ raise FileValidationError("File is empty")
96
+
97
+ try:
98
+ lines = file_content.decode('utf-8').strip().split('\n')
99
+ if not lines:
100
+ raise FileValidationError("File contains no valid data")
101
+
102
+ for line_num, line in enumerate(lines, 1):
103
+ try:
104
+ data = json.loads(line)
105
+
106
+ # Validate required structure
107
+ if not isinstance(data, dict):
108
+ raise FileValidationError(f"Line {line_num}: Each line must be a JSON object")
109
+
110
+ if "messages" not in data:
111
+ raise FileValidationError(f"Line {line_num}: Missing 'messages' field")
112
+
113
+ if not isinstance(data["messages"], list):
114
+ raise FileValidationError(f"Line {line_num}: 'messages' must be an array")
115
+
116
+ for msg in data["messages"]:
117
+ if not isinstance(msg, dict):
118
+ raise FileValidationError(f"Line {line_num}: Each message must be an object")
119
+ if "role" not in msg or "content" not in msg:
120
+ raise FileValidationError(f"Line {line_num}: Messages must have 'role' and 'content' fields")
121
+ if not isinstance(msg["role"], str) or not isinstance(msg["content"], str):
122
+ raise FileValidationError(f"Line {line_num}: Message 'role' and 'content' must be strings")
123
+
124
+ except json.JSONDecodeError:
125
+ raise FileValidationError(f"Invalid JSON on line {line_num}")
126
+
127
+ except UnicodeDecodeError:
128
+ raise FileValidationError("File must be valid UTF-8 encoded text")
@@ -3,12 +3,15 @@ from enum import Enum
3
3
  import logging
4
4
  from datetime import datetime
5
5
 
6
+ # https://platform.openai.com/docs/api-reference/fine-tuning/object
6
7
  class JobStatus(Enum):
7
- PENDING = "pending"
8
- QUEUED = "queued"
9
- RUNNING = "running"
10
- COMPLETED = "completed"
11
- FAILED = "failed"
8
+ PENDING = "pending" # Not in OAI
9
+ VALIDATING_FILES = "validating_files"
10
+ QUEUED = "queued"
11
+ RUNNING = "running"
12
+ SUCCEEDED = "succeeded"
13
+ FAILED = "failed"
14
+ CANCELLED = "cancelled"
12
15
 
13
16
  class JobLogHandler(logging.Handler):
14
17
  def __init__(self, job):
@@ -27,6 +30,7 @@ class Job:
27
30
  def __init__(self, id: str, status: JobStatus):
28
31
  self.id = id
29
32
  self.status = status
33
+ self.fine_tuned_model = None
30
34
  self.logs = []
31
35
  self.logger = None
32
36
  self.log_handler = None
@@ -61,10 +65,10 @@ class JobManager:
61
65
  def __init__(self):
62
66
  self.jobs = {}
63
67
 
64
- def get_job_status(self, job_id: str):
68
+ def get_job(self, job_id: str):
65
69
  if job_id not in self.jobs:
66
70
  raise ValueError(f"Job {job_id} not found")
67
- return self.jobs[job_id].status
71
+ return self.jobs[job_id]
68
72
 
69
73
  def create_job(self):
70
74
  job = Job(id=str(uuid.uuid4()), status=JobStatus.PENDING)
@@ -12,7 +12,7 @@ class TrainingManager:
12
12
  raise ValueError(f"Training file {request.training_file} not found")
13
13
 
14
14
  data_path = file["path"]
15
- output_dir = f"models/{request.model_name}" # TODO: This should be updated to be unique in some way
15
+ output_dir = f"models/{request.model}" # TODO: This should be updated to be unique in some way
16
16
 
17
17
 
18
18
  default_train_kwargs = {
@@ -59,9 +59,9 @@ class TrainingManager:
59
59
  logger.info(f"Using device: {device}")
60
60
 
61
61
  model = AutoModelForCausalLM.from_pretrained(
62
- pretrained_model_name_or_path=request.model_name
62
+ pretrained_model_name_or_path=request.model
63
63
  ).to(device)
64
- tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=request.model_name)
64
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=request.model)
65
65
 
66
66
  # Set up the chat format; generally only for non-chat model variants, hence the try-except.
67
67
  try:
@@ -166,7 +166,8 @@ class TrainingManager:
166
166
  torch.cuda.empty_cache()
167
167
 
168
168
  logger.info("Training completed successfully")
169
- job.status = JobStatus.COMPLETED
169
+ job.status = JobStatus.SUCCEEDED
170
+ job.fine_tuned_model = sft_config.output_dir
170
171
  except Exception as e:
171
172
  logger.error(f"Training failed: {str(e)}")
172
173
  job.status = JobStatus.FAILED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Noah Ziems
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.3
2
+ Name: arbor-ai
3
+ Version: 0.1.4
4
+ Summary: A framework for fine-tuning and managing language models
5
+ License: MIT
6
+ Keywords: machine learning,fine-tuning,language models
7
+ Author: Noah Ziems
8
+ Author-email: nziems2@nd.edu
9
+ Requires-Python: >=3.9, <3.14
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Requires-Dist: click
20
+ Requires-Dist: fastapi
21
+ Requires-Dist: peft (>=0.14.0,<0.15.0)
22
+ Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
23
+ Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
24
+ Requires-Dist: torch (>=2.6.0,<3.0.0)
25
+ Requires-Dist: transformers (>=4.49.0,<5.0.0)
26
+ Requires-Dist: trl (>=0.15.2,<0.16.0)
27
+ Requires-Dist: uvicorn
28
+ Project-URL: Repository, https://github.com/arbor-ai/arbor
29
+ Description-Content-Type: text/markdown
30
+
31
+ # Arbor 🌳
32
+
33
+ A drop-in replacement for OpenAI's fine-tuning API that lets you fine-tune and manage open-source language models locally. Train and deploy custom models with the same API you already know.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install arbor-ai
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ 1. Start the Arbor server:
44
+
45
+ ```bash
46
+ arbor serve
47
+ ```
48
+
49
+ 2. The server will be available at `http://localhost:8000`.
50
+
51
+ 3. Upload your training data:
52
+
53
+ ```python
54
+ import requests
55
+
56
+ requests.post('http://127.0.0.1:8000/api/files', files={'file': open('your_file.jsonl', 'rb')})
57
+ ```
58
+
59
+ 4. Submit a fine-tuning job:
60
+
61
+ ```python
62
+ requests.post('http://127.0.0.1:8000/api/fine-tune', json={'model': 'HuggingFaceTB/SmolLM2-135M-Instruct', 'training_file': 'Returned file ID from Step 3'})
63
+ ```
64
+
65
+ 5. Monitor the job status:
66
+
67
+ ```python
68
+ requests.get('http://127.0.0.1:8000/api/jobs/{Returned job ID from Step 4}')
69
+ ```
70
+
71
+
72
+
73
+ ## Development Setup
74
+
75
+ ```bash
76
+ poetry install
77
+ ```
78
+
79
+ ```bash
80
+ poetry run arbor serve
81
+ ```
82
+
83
+ ```bash
84
+ poetry run pytest
85
+ ```
86
+
87
+ ## Contributing
88
+
89
+ Contributions are welcome! Please feel free to submit a Pull Request.
90
+
91
+ ## License
92
+
93
+ This project is licensed under the MIT License - see the LICENSE file for details.
94
+
95
+ ## Support
96
+
97
+ If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/Ziems/arbor/issues).
@@ -4,23 +4,24 @@ arbor/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  arbor/client/api.py,sha256=WFaNtwCNWXRAHHG1Jfyl7LvTP6jiEyQOLZn2Z8Yjt5k,40
5
5
  arbor/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
6
  arbor/server/api/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
7
- arbor/server/api/models/schemas.py,sha256=-_NOnUuEbiO8uLDwEpByYV6NAMasOmFUJXxG0eXA_D0,367
7
+ arbor/server/api/models/schemas.py,sha256=19uDproKWhPQvVTit0hWuqmPb80zrELtCgnLybDuBKw,398
8
8
  arbor/server/api/routes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- arbor/server/api/routes/files.py,sha256=QrPY9-886NXnXjGRlT-pl5kWbnwfogCrdmv6RufJpVg,466
10
- arbor/server/api/routes/jobs.py,sha256=ibL0tQA2Apqa91vycv3NPT0ydhkba4vnPoclw-bVKXs,510
11
- arbor/server/api/routes/training.py,sha256=43NOvh1Hubg3Ocfhu5E82Tp_kXOJL8H8oQXf-4H1yMU,981
9
+ arbor/server/api/routes/files.py,sha256=U5QPC05VzqgDirB77lpy6BJLvg3zo1eGz7RUEk3HgRw,970
10
+ arbor/server/api/routes/jobs.py,sha256=W2Y-rByaULxT0pEy3_YSNWO2CEKR5obyax-uR4ax_6Y,539
11
+ arbor/server/api/routes/training.py,sha256=5M6OAtl9i8L-jBefmvPWvyf1M_x30-IlXzgleBg41Yc,977
12
12
  arbor/server/core/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
13
13
  arbor/server/core/config.py,sha256=R67gNeUXz0RShvpr8XF3Lpn7-RMOfKf2xTIyqXvj4PI,215
14
14
  arbor/server/core/logging.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  arbor/server/main.py,sha256=I3chVYsoG56zE7Clf88lEuOPaDzJvKsOzivOWpsFDls,350
16
16
  arbor/server/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  arbor/server/services/dependencies.py,sha256=y3EoIkwScYc811jZ8p5m0kJT4ixRo7vguimBKKMuxAQ,458
18
- arbor/server/services/file_manager.py,sha256=uE7Mnbn9fC1H7sAMzJt1x9fVak10duh0OyAZeDgi3iY,2200
19
- arbor/server/services/job_manager.py,sha256=Zx3d0h31YH9bQ4yQr3FUXUGEHd-KUiTekZ0ndGOptrY,1893
20
- arbor/server/services/training_manager.py,sha256=SNumrzM1B-V1HBucUHmxnmc4rmhSCHVgPPc5nNPRC4Q,10682
18
+ arbor/server/services/file_manager.py,sha256=VUCn0cUtd-Q1BrUPtKStS1hGtV_OlymUyA0I8zeG9Po,4037
19
+ arbor/server/services/job_manager.py,sha256=rZjuhwwbvL7yCJi653tv7z36iFFvp1w5J9j5DntSWKM,2073
20
+ arbor/server/services/training_manager.py,sha256=BQsUsxOyRlgFDEFM77tyIahmm4NqcoOwxq8Tlmp66dY,10724
21
21
  arbor/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  arbor/server/utils/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- arbor_ai-0.1.3.dist-info/METADATA,sha256=PJbOddt69fyZXJggzpaSUb5XfUt0ouPrQQFIAaeOasE,1272
24
- arbor_ai-0.1.3.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
25
- arbor_ai-0.1.3.dist-info/entry_points.txt,sha256=AaLg05CZSQeP2oGlCH_AnmZPz-zzLlVtpXToI4cM3kY,39
26
- arbor_ai-0.1.3.dist-info/RECORD,,
23
+ arbor_ai-0.1.4.dist-info/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
24
+ arbor_ai-0.1.4.dist-info/METADATA,sha256=977OGIuruJzS8wkFntELEoO7Ey5VzEhv88v1Pt81pa0,2451
25
+ arbor_ai-0.1.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
26
+ arbor_ai-0.1.4.dist-info/entry_points.txt,sha256=AaLg05CZSQeP2oGlCH_AnmZPz-zzLlVtpXToI4cM3kY,39
27
+ arbor_ai-0.1.4.dist-info/RECORD,,
@@ -1,47 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: arbor-ai
3
- Version: 0.1.3
4
- Summary: A framework for fine-tuning and managing language models
5
- License: MIT
6
- Keywords: machine learning,fine-tuning,language models
7
- Author: Noah Ziems
8
- Author-email: nziems2@nd.edu
9
- Requires-Python: >=3.13
10
- Classifier: Development Status :: 3 - Alpha
11
- Classifier: Intended Audience :: Developers
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.13
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.8
17
- Classifier: Programming Language :: Python :: 3.9
18
- Requires-Dist: click
19
- Requires-Dist: fastapi
20
- Requires-Dist: peft (>=0.14.0,<0.15.0)
21
- Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
22
- Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
23
- Requires-Dist: torch (>=2.6.0,<3.0.0)
24
- Requires-Dist: transformers (>=4.49.0,<5.0.0)
25
- Requires-Dist: trl (>=0.15.2,<0.16.0)
26
- Requires-Dist: uvicorn
27
- Project-URL: Repository, https://github.com/arbor-ai/arbor
28
- Description-Content-Type: text/markdown
29
-
30
- # Arbor AI
31
-
32
- ## Setup
33
-
34
- ```bash
35
- poetry install
36
- ```
37
-
38
- ```bash
39
- poetry run arbor serve
40
- ```
41
-
42
- ## Uploading Data
43
-
44
- ```bash
45
- curl -X POST "http://localhost:8000/api/files" -F "file=@training_data.jsonl"
46
- ```
47
-