marco-dvcs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- marco_dvcs-0.1.0/PKG-INFO +123 -0
- marco_dvcs-0.1.0/README.md +98 -0
- marco_dvcs-0.1.0/marco/__init__.py +1 -0
- marco_dvcs-0.1.0/marco/__main__.py +4 -0
- marco_dvcs-0.1.0/marco/cli/__init__.py +0 -0
- marco_dvcs-0.1.0/marco/cli/interactive.py +41 -0
- marco_dvcs-0.1.0/marco/cli/main.py +76 -0
- marco_dvcs-0.1.0/marco/cli/web_serve.py +7 -0
- marco_dvcs-0.1.0/marco/core/__init__.py +0 -0
- marco_dvcs-0.1.0/marco/core/locker.py +42 -0
- marco_dvcs-0.1.0/marco/core/preprocessor.py +268 -0
- marco_dvcs-0.1.0/marco/core/repository.py +213 -0
- marco_dvcs-0.1.0/marco/ui/__init__.py +0 -0
- marco_dvcs-0.1.0/marco/web/__init__.py +0 -0
- marco_dvcs-0.1.0/marco/web/app.py +52 -0
- marco_dvcs-0.1.0/marco_dvcs.egg-info/PKG-INFO +123 -0
- marco_dvcs-0.1.0/marco_dvcs.egg-info/SOURCES.txt +22 -0
- marco_dvcs-0.1.0/marco_dvcs.egg-info/dependency_links.txt +1 -0
- marco_dvcs-0.1.0/marco_dvcs.egg-info/entry_points.txt +2 -0
- marco_dvcs-0.1.0/marco_dvcs.egg-info/requires.txt +3 -0
- marco_dvcs-0.1.0/marco_dvcs.egg-info/top_level.txt +1 -0
- marco_dvcs-0.1.0/pyproject.toml +3 -0
- marco_dvcs-0.1.0/setup.cfg +4 -0
- marco_dvcs-0.1.0/setup.py +35 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: marco-dvcs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A minimal dataset versioning system for text data with a focus on reproducibility.
|
|
5
|
+
Home-page: https://github.com/Team-Marco-ACM/marco-package
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Author-email: your.email@example.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: pandas
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: Flask
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: author-email
|
|
18
|
+
Dynamic: classifier
|
|
19
|
+
Dynamic: description
|
|
20
|
+
Dynamic: description-content-type
|
|
21
|
+
Dynamic: home-page
|
|
22
|
+
Dynamic: requires-dist
|
|
23
|
+
Dynamic: requires-python
|
|
24
|
+
Dynamic: summary
|
|
25
|
+
|
|
26
|
+
# Marco Dataset Versioning System
|
|
27
|
+
|
|
28
|
+
A minimal dataset versioning system for text data with a strong focus on reproducibility and transparency. Treat your text datasets like code — immutable, versioned, reproducible, and explainable.
|
|
29
|
+
|
|
30
|
+
Marco acts as a lightweight Python library, meaning you can initialize it in *any* machine learning project folder to safely version and preprocess your datasets without altering your original files.
|
|
31
|
+
|
|
32
|
+
## 🚀 Installation (Linux / macOS)
|
|
33
|
+
|
|
34
|
+
On modern Linux environments (like Arch Linux, Ubuntu 23.04+), Python packages must be installed in a Virtual Environment (PEP 668) to prevent conflicts with your system packages.
|
|
35
|
+
|
|
36
|
+
Follow these steps to safely install Marco into your ML project:
|
|
37
|
+
|
|
38
|
+
1. **Clone this repository** to your local machine:
|
|
39
|
+
```bash
|
|
40
|
+
git clone https://github.com/Team-Marco-ACM/marco-package.git marco
|
|
41
|
+
cd marco
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
2. **Navigate to the ML project folder** where you want to train your model (e.g. your bag-of-words project):
|
|
45
|
+
```bash
|
|
46
|
+
cd ~/projects/my-bag-of-words-model
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
3. **Create and activate a Python Virtual Environment**:
|
|
50
|
+
```bash
|
|
51
|
+
# Create a virtual environment named 'venv'
|
|
52
|
+
python3 -m venv venv
|
|
53
|
+
|
|
54
|
+
# Activate it (You must do this every time you open a new terminal in this folder)
|
|
55
|
+
source venv/bin/activate
|
|
56
|
+
```
|
|
57
|
+
*(You should now see `(venv)` at the start of your terminal prompt!)*
|
|
58
|
+
|
|
59
|
+
4. **Install Marco**:
|
|
60
|
+
```bash
|
|
61
|
+
# Point pip to the directory where you cloned the marco repository
|
|
62
|
+
pip install -e /path/to/marco
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 🛠️ Usage Guide
|
|
68
|
+
|
|
69
|
+
Once `marco` is installed in your virtual environment, you have access to the full CLI!
|
|
70
|
+
|
|
71
|
+
### 1. Initialize a Repository
|
|
72
|
+
Initialize Marco tracking in your current directory. This creates a `.marco/` data versioning environment specific to that project.
|
|
73
|
+
```bash
|
|
74
|
+
marco init
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 2. Create an Immutable Version
|
|
78
|
+
Upload a text/CSV/TSV dataset to create an immutable version. Marco will compute a cryptographically secure SHA-256 hash using the raw data + the preprocessing configuration.
|
|
79
|
+
|
|
80
|
+
**Interactive Mode:**
|
|
81
|
+
If you don't supply a configuration file, Marco will interactively guide you through building the preprocessing pipeline (newline normalization, lowercasing, tokenization, stopword removal, length filtering, and deduplication).
|
|
82
|
+
```bash
|
|
83
|
+
marco upload my_dataset.csv -t v1-raw
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Config Mode:**
|
|
87
|
+
```bash
|
|
88
|
+
marco upload my_dataset.csv -c my_config.json -t v1-processed
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### 3. List Versions
|
|
92
|
+
View all the versions you've created, along with their tags and timestamps.
|
|
93
|
+
```bash
|
|
94
|
+
marco list
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 4. Start the Visual Dashboard
|
|
98
|
+
Open the Flask-powered web dashboard to view dataset metadata, differences between versions, and complete data lineage trees.
|
|
99
|
+
```bash
|
|
100
|
+
python -m marco.web.app --port 5000
|
|
101
|
+
```
|
|
102
|
+
Open `http://localhost:5000` in your web browser.
|
|
103
|
+
|
|
104
|
+
### 5. Export / Import Versions
|
|
105
|
+
Easily share dataset versions with teammates by packing them into `.tar.gz` files.
|
|
106
|
+
```bash
|
|
107
|
+
# Export version 'v1-raw' to the 'exports' folder
|
|
108
|
+
marco export v1-raw ./exports/
|
|
109
|
+
|
|
110
|
+
# Import an archive sent to you by a coworker
|
|
111
|
+
marco import ./exports/marco_version_e5e0b767.tar.gz
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 🧠 Architecture Overview
|
|
117
|
+
|
|
118
|
+
Marco decouples logic from the file system. All core engine operations sit inside `marco/core/`, including:
|
|
119
|
+
- `locker.py`: File-based concurrency control using `.lock` files.
|
|
120
|
+
- `repository.py`: CRUD operations for dataset versions and `refs.json` tagging.
|
|
121
|
+
- `preprocessor.py`: A robust Directed Acyclic Graph (DAG) preprocessing engine.
|
|
122
|
+
|
|
123
|
+
Have fun building safer machine learning pipelines!
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Marco Dataset Versioning System
|
|
2
|
+
|
|
3
|
+
A minimal dataset versioning system for text data with a strong focus on reproducibility and transparency. Treat your text datasets like code — immutable, versioned, reproducible, and explainable.
|
|
4
|
+
|
|
5
|
+
Marco acts as a lightweight Python library, meaning you can initialize it in *any* machine learning project folder to safely version and preprocess your datasets without altering your original files.
|
|
6
|
+
|
|
7
|
+
## 🚀 Installation (Linux / macOS)
|
|
8
|
+
|
|
9
|
+
On modern Linux environments (like Arch Linux, Ubuntu 23.04+), Python packages must be installed in a Virtual Environment (PEP 668) to prevent conflicts with your system packages.
|
|
10
|
+
|
|
11
|
+
Follow these steps to safely install Marco into your ML project:
|
|
12
|
+
|
|
13
|
+
1. **Clone this repository** to your local machine:
|
|
14
|
+
```bash
|
|
15
|
+
git clone https://github.com/Team-Marco-ACM/marco-package.git marco
|
|
16
|
+
cd marco
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
2. **Navigate to the ML project folder** where you want to train your model (e.g. your bag-of-words project):
|
|
20
|
+
```bash
|
|
21
|
+
cd ~/projects/my-bag-of-words-model
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
3. **Create and activate a Python Virtual Environment**:
|
|
25
|
+
```bash
|
|
26
|
+
# Create a virtual environment named 'venv'
|
|
27
|
+
python3 -m venv venv
|
|
28
|
+
|
|
29
|
+
# Activate it (You must do this every time you open a new terminal in this folder)
|
|
30
|
+
source venv/bin/activate
|
|
31
|
+
```
|
|
32
|
+
*(You should now see `(venv)` at the start of your terminal prompt!)*
|
|
33
|
+
|
|
34
|
+
4. **Install Marco**:
|
|
35
|
+
```bash
|
|
36
|
+
# Point pip to the directory where you cloned the marco repository
|
|
37
|
+
pip install -e /path/to/marco
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## 🛠️ Usage Guide
|
|
43
|
+
|
|
44
|
+
Once `marco` is installed in your virtual environment, you have access to the full CLI!
|
|
45
|
+
|
|
46
|
+
### 1. Initialize a Repository
|
|
47
|
+
Initialize Marco tracking in your current directory. This creates a `.marco/` data versioning environment specific to that project.
|
|
48
|
+
```bash
|
|
49
|
+
marco init
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 2. Create an Immutable Version
|
|
53
|
+
Upload a text/CSV/TSV dataset to create an immutable version. Marco will compute a cryptographically secure SHA-256 hash using the raw data + the preprocessing configuration.
|
|
54
|
+
|
|
55
|
+
**Interactive Mode:**
|
|
56
|
+
If you don't supply a configuration file, Marco will interactively guide you through building the preprocessing pipeline (newline normalization, lowercasing, tokenization, stopword removal, length filtering, and deduplication).
|
|
57
|
+
```bash
|
|
58
|
+
marco upload my_dataset.csv -t v1-raw
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Config Mode:**
|
|
62
|
+
```bash
|
|
63
|
+
marco upload my_dataset.csv -c my_config.json -t v1-processed
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 3. List Versions
|
|
67
|
+
View all the versions you've created, along with their tags and timestamps.
|
|
68
|
+
```bash
|
|
69
|
+
marco list
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### 4. Start the Visual Dashboard
|
|
73
|
+
Open the Flask-powered web dashboard to view dataset metadata, differences between versions, and complete data lineage trees.
|
|
74
|
+
```bash
|
|
75
|
+
python -m marco.web.app --port 5000
|
|
76
|
+
```
|
|
77
|
+
Open `http://localhost:5000` in your web browser.
|
|
78
|
+
|
|
79
|
+
### 5. Export / Import Versions
|
|
80
|
+
Easily share dataset versions with teammates by packing them into `.tar.gz` files.
|
|
81
|
+
```bash
|
|
82
|
+
# Export version 'v1-raw' to the 'exports' folder
|
|
83
|
+
marco export v1-raw ./exports/
|
|
84
|
+
|
|
85
|
+
# Import an archive sent to you by a coworker
|
|
86
|
+
marco import ./exports/marco_version_e5e0b767.tar.gz
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 🧠 Architecture Overview
|
|
92
|
+
|
|
93
|
+
Marco decouples logic from the file system. All core engine operations sit inside `marco/core/`, including:
|
|
94
|
+
- `locker.py`: File-based concurrency control using `.lock` files.
|
|
95
|
+
- `repository.py`: CRUD operations for dataset versions and `refs.json` tagging.
|
|
96
|
+
- `preprocessor.py`: A robust Directed Acyclic Graph (DAG) preprocessing engine.
|
|
97
|
+
|
|
98
|
+
Have fun building safer machine learning pipelines!
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
File without changes
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
def build_interactive_config() -> dict:
    """Interactively assemble a linear preprocessing pipeline configuration.

    Asks six yes/no questions on stdin (any answer other than ``n`` counts as
    yes), plus follow-up questions for the tokenizer method, the stopword
    language and the token-count bounds. The selected steps are chained so
    that each one depends on the step chosen before it.

    Returns:
        ``{"dag": {...}}`` where every key is ``step_<position>_<func>`` and
        every value holds ``func``, ``params`` and ``depends_on``.

    Raises:
        ValueError: if a token-count bound is not a valid integer.
    """
    def confirmed(question: str) -> bool:
        # Anything except an explicit "n" is treated as consent.
        return input(question).strip().lower() != 'n'

    def ask(question: str, fallback: str) -> str:
        # An empty answer selects the default shown in the prompt.
        return input(question).strip() or fallback

    print("Welcome to Marco Interactive Dataset Builder!")
    chosen = []  # (func name, params) in the order the user accepted them

    if confirmed("1. Clean newlines? (Y/n): "):
        chosen.append(("normalize_newlines", {}))

    if confirmed("2. Convert to lowercase? (Y/n): "):
        chosen.append(("lowercase", {"enabled": True}))

    if confirmed("3. Tokenize? (Y/n): "):
        chosen.append(("tokenize", {"method": ask(" Method [whitespace]: ", "whitespace")}))

    if confirmed("4. Remove Stopwords? (Y/n): "):
        chosen.append(("remove_stopwords", {"language": ask(" Language [english]: ", "english")}))

    if confirmed("5. Filter by length? (Y/n): "):
        lower = ask(" Min tokens [1]: ", "1")
        upper = ask(" Max tokens [1000]: ", "1000")
        chosen.append(("filter_length", {"min_tokens": int(lower), "max_tokens": int(upper)}))

    if confirmed("6. Deduplicate? (Y/n): "):
        chosen.append(("deduplicate", {"method": "exact"}))

    # Chain the accepted steps: each node depends on the one before it.
    dag = {}
    previous = None
    for position, (func, params) in enumerate(chosen, start=1):
        name = f"step_{position}_{func}"
        dag[name] = {
            "func": func,
            "params": params,
            "depends_on": [previous] if previous else [],
        }
        previous = name

    print("\nGenerated Pipeline Configuration:")
    print(json.dumps({"dag": dag}, indent=2))

    return {"dag": dag}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from marco.core import repository
|
|
6
|
+
from marco.cli.interactive import build_interactive_config
|
|
7
|
+
|
|
8
|
+
def main():
    """Parse the command line and run the requested ``marco`` subcommand.

    Subcommands: ``init``, ``upload``, ``list``, ``export`` and ``import``.
    Every subcommand except ``init`` operates on the repository in the
    current working directory.
    """
    parser = argparse.ArgumentParser(prog="marco", description="Marco Dataset Versioning System")
    commands = parser.add_subparsers(dest="command", required=True)

    init_cmd = commands.add_parser("init", help="Initialize a marco repository")
    init_cmd.add_argument("path", nargs="?", default=".", help="Path to initialize")

    upload_cmd = commands.add_parser("upload", help="Create a dataset version")
    upload_cmd.add_argument("file", help="Path to raw dataset string/CSV/TSV")
    upload_cmd.add_argument("-c", "--config", help="Path to config.json")
    upload_cmd.add_argument("-t", "--tag", help="Tag for the version")
    upload_cmd.add_argument("-u", "--user", default="unknown", help="User name")

    commands.add_parser("list", help="List all dataset versions")

    export_cmd = commands.add_parser("export", help="Export a version to tar.gz")
    export_cmd.add_argument("version", help="Version Hash or Tag")
    export_cmd.add_argument("dest", help="Destination folder")

    import_cmd = commands.add_parser("import", help="Import a version from tar.gz")
    import_cmd.add_argument("tarball", help="Path to exported tar.gz")

    args = parser.parse_args()
    repo_path = Path.cwd()
    command = args.command

    if command == "init":
        target = args.path or "."
        repository.init_repo(target)
        print(f"Initialized Marco repository in {Path(target).resolve() / '.marco'}")
        return

    if command == "upload":
        raw_path = Path(args.file)
        if not raw_path.exists():
            print(f"Error: File {raw_path} does not exist.")
            sys.exit(1)

        # Without an explicit config file the pipeline is built interactively.
        if args.config:
            config = json.loads(Path(args.config).read_text(encoding='utf-8'))
        else:
            config = build_interactive_config()

        tags = [args.tag] if args.tag else []
        vid = repository.create_version(raw_path, config, repo_path, user=args.user, tags=tags)
        print(f"Successfully processed and generated version: {vid}")
        return

    if command == "list":
        try:
            versions = repository.list_versions(repo_path)
            print(f"{'Version':<10} | {'Created At':<20} | {'User':<10} | {'Tags':<20}")
            print("-" * 68)
            for entry in versions:
                short = entry['version_id'][:8]
                # Trim the timestamp to seconds and swap the ISO 'T' for a space.
                created = entry.get('created_at', '')[:19].replace('T', ' ')
                author = entry.get('created_by', 'unknown')
                tag_text = ", ".join(entry.get('tags', []))
                print(f"{short:<10} | {created:<20} | {author:<10} | {tag_text:<20}")
        except Exception as e:
            print(f"Failed to list versions: {e}")
        return

    if command == "export":
        tar_path = repository.export_version(args.version, args.dest, repo_path)
        print(f"Exported to {tar_path}")
        return

    if command == "import":
        vid = repository.import_version(args.tarball, repo_path)
        print(f"Successfully imported version {vid}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# This file is reserved for Developer 2 to implement the React Web Server logic.
|
|
2
|
+
# The `marco-generate-web` CLI command will point here.
|
|
3
|
+
|
|
4
|
+
def serve_react_app():
    """Placeholder for the React dashboard server.

    Only prints a notice for now; no server is started.
    """
    # TODO: Implement local HTTP server for the compiled React App in marco/ui/dist
    print("React visualization dashboard coming soon!")
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import os
|
|
3
|
+
import contextlib
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
class LockAcquisitionError(Exception):
    """Raised when a file lock cannot be acquired before the timeout expires."""


@contextlib.contextmanager
def FileLock(lock_dir: Path, name: str, timeout: int = 10, backoff: float = 0.5):
    """Hold an exclusive file-based lock for the duration of a ``with`` block.

    The lock is the file ``<lock_dir>/<name>.lock``, created with the
    exclusive-creation mode ``'x'`` so that only one caller can create it.
    The current process id is written into the file.

    Args:
        lock_dir: Directory holding lock files; created if missing.
        name: Lock name, used as the lock file's stem.
        timeout: Maximum number of seconds to keep retrying. At least one
            acquisition attempt is always made, even when this is 0.
        backoff: Constant delay in seconds between attempts.

    Raises:
        LockAcquisitionError: if the lock file still exists once the timeout
            has elapsed.
    """
    lock_dir = Path(lock_dir)
    lock_dir.mkdir(parents=True, exist_ok=True)
    lock_file = lock_dir / f"{name}.lock"

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        try:
            # Atomic creation. Fails if file already exists.
            with open(lock_file, "x") as f:
                f.write(str(os.getpid()))
            break
        except FileExistsError:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise LockAcquisitionError(
                    f"Could not acquire lock '{name}' within {timeout} seconds"
                ) from None
            # Never sleep past the deadline, so the final attempt lands on it.
            time.sleep(min(backoff, remaining))

    try:
        yield
    finally:
        # Best-effort release: a missing or undeletable lock file must not
        # mask an exception raised inside the ``with`` body.
        with contextlib.suppress(OSError):
            lock_file.unlink()
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# marco/core/preprocessor.py
|
|
2
|
+
import json
|
|
3
|
+
import hashlib
|
|
4
|
+
from collections import defaultdict, deque
|
|
5
|
+
from typing import List, Tuple, Dict, Callable, Any
|
|
6
|
+
|
|
7
|
+
# pandas is used by compute_metrics_for_rows (median tokens per document)
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
# Type: data is list of (label, text)
|
|
11
|
+
DataRows = List[Tuple[str, str]]  # rows passed between nodes: (label, text) pairs
NodeSpec = Dict[str, Any]  # one node entry; the runner reads its 'func', 'params' and 'depends_on' keys
PipelineSpec = Dict[str, NodeSpec]  # node name -> node spec
|
|
14
|
+
|
|
15
|
+
# Built-in English stopword set consulted by remove_stopwords_node.
# Entries are lowercase; tokens are lowercased before the membership test.
ENGLISH_STOPWORDS = {
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could",
    "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for",
    "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's",
    "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't",
    "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so",
    "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
    "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
}
|
|
30
|
+
|
|
31
|
+
# -------------------------
|
|
32
|
+
# Node implementations
|
|
33
|
+
# -------------------------
|
|
34
|
+
# Each node function takes DataRows and params, returns DataRows.
|
|
35
|
+
# Implementations MUST be deterministic.
|
|
36
|
+
|
|
37
|
+
def normalize_newlines_node(data: DataRows, params: dict) -> DataRows:
    """Rewrite CRLF and bare CR line endings in every text as LF.

    ``params`` is accepted for interface uniformity and is not read.
    """
    return [
        (label, text.replace('\r\n', '\n').replace('\r', '\n'))
        for label, text in data
    ]
|
|
43
|
+
|
|
44
|
+
def lowercase_node(data: DataRows, params: dict) -> DataRows:
    """Lowercase every text unless ``params['enabled']`` is falsy.

    When disabled, the input list itself is returned unchanged.
    """
    if params.get('enabled', True):
        return [(label, text.lower()) for label, text in data]
    return data
|
|
49
|
+
|
|
50
|
+
def uppercase_custom_node(data: DataRows, params: dict) -> DataRows:
    """Uppercase every text; labels are left untouched and ``params`` is not read."""
    shouted = []
    for label, text in data:
        shouted.append((label, text.upper()))
    return shouted
|
|
52
|
+
|
|
53
|
+
def tokenize_node(data: DataRows, params: dict) -> DataRows:
    """Tokenize each text on whitespace and store the tokens space-joined.

    ``params['method']`` is read, but every value currently falls back to
    whitespace tokenization.
    """
    method = params.get('method', 'whitespace')  # only whitespace splitting is implemented
    # str.split() with no argument already drops leading/trailing whitespace
    # and never yields empty tokens.
    return [(label, ' '.join(text.split())) for label, text in data]
|
|
65
|
+
|
|
66
|
+
def filter_length_node(data: DataRows, params: dict) -> DataRows:
    """Keep rows whose whitespace-token count lies within the inclusive bounds.

    Bounds come from ``params['min_tokens']`` (default 0) and
    ``params['max_tokens']`` (default 10,000,000). An empty text counts as
    zero tokens.
    """
    lower = int(params.get('min_tokens', 0))
    upper = int(params.get('max_tokens', 10_000_000))

    def token_count(text):
        return len(text.split()) if text else 0

    return [
        (label, text)
        for label, text in data
        if lower <= token_count(text) <= upper
    ]
|
|
75
|
+
|
|
76
|
+
def remove_stopwords_node(data: DataRows, params: dict) -> DataRows:
    """Drop English stopwords from each text (case-insensitive match).

    Only ``params['language'] == 'english'`` (the default) is supported; for
    any other language the input is returned unchanged.
    """
    if params.get('language', 'english') != 'english':
        # only support english built-in for now
        return data

    return [
        (label, ' '.join(word for word in text.split() if word.lower() not in ENGLISH_STOPWORDS))
        for label, text in data
    ]
|
|
88
|
+
|
|
89
|
+
def deduplicate_node(data: DataRows, params: dict) -> DataRows:
    """Drop rows whose text was already seen, keeping the first occurrence.

    Duplicates are detected on the text alone; labels are ignored. With
    ``params['method'] == 'exact'`` (the default) the text itself is the key,
    otherwise its SHA-256 hex digest is used.
    """
    use_raw_text = params.get('method', 'exact') == 'exact'
    seen = set()
    unique = []
    for label, text in data:
        fingerprint = text if use_raw_text else hashlib.sha256(text.encode('utf-8')).hexdigest()
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique.append((label, text))
    return unique
|
|
101
|
+
|
|
102
|
+
# node registry maps string name to function
|
|
103
|
+
# Lookup table from the 'func' string of a pipeline node spec to the function
# implementing it. Every entry takes (rows, params) and returns rows.
_NODE_REGISTRY: Dict[str, Callable[[DataRows, dict], DataRows]] = {
    'normalize_newlines': normalize_newlines_node,
    'lowercase': lowercase_node,
    'uppercase_custom': uppercase_custom_node,
    'tokenize': tokenize_node,
    'filter_length': filter_length_node,
    'remove_stopwords': remove_stopwords_node,
    'deduplicate': deduplicate_node,
}
|
|
112
|
+
|
|
113
|
+
# -------------------------
|
|
114
|
+
# DAG helpers
|
|
115
|
+
# -------------------------
|
|
116
|
+
def validate_dag(spec: PipelineSpec):
    """Check that a pipeline spec is a well-formed directed acyclic graph.

    Walks every node depth-first along its ``depends_on`` edges.

    Returns:
        ``True`` when the spec is valid.

    Raises:
        ValueError: if a dependency names a node absent from ``spec`` or if
            the dependencies contain a cycle.
    """
    in_progress, done = 'visiting', 'visited'
    state = {}

    def visit(name):
        if name not in spec:
            raise ValueError(f"Node '{name}' is declared as dependency but missing from pipeline spec")
        status = state.get(name)
        if status == in_progress:
            # Reaching a node that is still on the current path means a back edge.
            raise ValueError(f"Cycle detected at node: {name}")
        if status == done:
            return
        state[name] = in_progress
        for parent in spec[name].get('depends_on', []):
            visit(parent)
        state[name] = done

    for name in spec:
        if name not in state:
            visit(name)
    return True
|
|
134
|
+
|
|
135
|
+
def topological_sort_order(spec: PipelineSpec) -> List[str]:
    """Return the node names of ``spec`` in dependency order (Kahn's algorithm).

    Ties are broken by sorting node names, so the order is reproducible for a
    given spec.

    Raises:
        ValueError: if not every node can be scheduled (a cycle, or a
            dependency on a node that is not in ``spec``).
    """
    pending = dict.fromkeys(spec, 0)  # unmet dependency count per node
    children = defaultdict(list)      # dependency -> nodes waiting on it
    for name, node in spec.items():
        for parent in node.get('depends_on', []):
            pending[name] += 1
            children[parent].append(name)

    # Seed the queue with the dependency-free nodes, in sorted order.
    ready = deque(sorted(name for name, count in pending.items() if count == 0))
    ordered = []
    while ready:
        current = ready.popleft()
        ordered.append(current)
        for child in sorted(children.get(current, [])):
            pending[child] -= 1
            if pending[child] == 0:
                ready.append(child)

    if len(ordered) != len(spec):
        raise ValueError("Cycle or missing nodes detected in pipeline spec")
    return ordered
|
|
156
|
+
|
|
157
|
+
# -------------------------
|
|
158
|
+
# Metrics helper
|
|
159
|
+
# -------------------------
|
|
160
|
+
def compute_metrics_for_rows(rows: DataRows) -> dict:
    """Summarize a set of rows using whitespace-token statistics.

    Returns:
        A dict with the document count, total token count, vocabulary size,
        mean and median tokens per document, number of token-less documents,
        and up to 50 most frequent tokens as ``{'token', 'count'}`` entries
        in descending count order.
    """
    doc_count = len(rows)
    token_total = 0
    empty_docs = 0
    frequencies = {}

    for _, text in rows:
        words = text.split() if text else []
        if not words:
            empty_docs += 1
        token_total += len(words)
        for word in words:
            frequencies[word] = frequencies.get(word, 0) + 1

    if doc_count:
        mean_tokens = float(token_total / doc_count)
        median_tokens = float(pd.Series([len(text.split()) for _, text in rows]).median())
    else:
        mean_tokens = 0.0
        median_tokens = 0.0

    # Stable sort: tokens with equal counts keep first-seen order.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:50]

    return {
        'n_documents': int(doc_count),
        'n_tokens': int(token_total),
        'vocab_size': int(len(frequencies)),
        'avg_tokens_per_doc': mean_tokens,
        'median_tokens_per_doc': median_tokens,
        'n_empty_docs': int(empty_docs),
        'top_tokens': [{'token': token, 'count': count} for token, count in ranked],
    }
|
|
181
|
+
|
|
182
|
+
# -------------------------
|
|
183
|
+
# Runner
|
|
184
|
+
# -------------------------
|
|
185
|
+
def run_pipeline_dag(raw_rows: DataRows, pipeline_spec: PipelineSpec) -> Tuple[DataRows, dict, dict]:
    """Execute a preprocessing pipeline described as a DAG of named nodes.

    Args:
        raw_rows: list of (label, text) rows fed to every node without
            dependencies.
        pipeline_spec: dict mapping node_name ->
            { 'func': <string key in registry>, 'params': {...}, 'depends_on': [...] }

    Returns:
        final_rows: concatenation of the outputs of the final nodes (nodes no
            other node depends on), in sorted node-name order. For an empty
            spec this is a copy of ``raw_rows``.
        metrics_per_step: dict mapping node_name -> metrics of its output.
        dag_structure: per node, its ``depends_on``, ``func`` and ``params``.

    Raises:
        ValueError: if the spec is not a valid DAG or a node names an unknown
            ``func``.
        RuntimeError: if a dependency's output is missing when a node runs.
    """
    # Validate and get order
    validate_dag(pipeline_spec)
    order = topological_sort_order(pipeline_spec)

    # 'raw' is the canonical input name
    results: Dict[str, DataRows] = {'raw': raw_rows}
    metrics_per_step = {}

    # run nodes in deterministic order
    for node in order:
        node_spec = pipeline_spec[node]
        func_key = node_spec.get('func')
        params = node_spec.get('params', {})
        depends = node_spec.get('depends_on', [])

        # gather inputs deterministically: concat in order of depends list; if no deps use raw
        if depends:
            inputs = []
            for d in depends:
                # If dependency not run (shouldn't happen), raise
                if d not in results:
                    raise RuntimeError(f"Dependency output missing for {d}")
                inputs.extend(results[d])
        else:
            inputs = results['raw']

        if func_key not in _NODE_REGISTRY:
            raise ValueError(f"Unknown node func '{func_key}' for node '{node}'")
        out_rows = _NODE_REGISTRY[func_key](inputs, params)

        # store result and metrics
        results[node] = out_rows
        metrics_per_step[node] = compute_metrics_for_rows(out_rows)

    # Final nodes: nodes that are not depended upon by any other node
    depended = {d for s in pipeline_spec.values() for d in s.get('depends_on', [])}
    final_nodes = sorted(n for n in pipeline_spec if n not in depended)

    if final_nodes:
        # Deterministic final output: concat outputs of final nodes in sorted order
        final_output: DataRows = [row for fn in final_nodes for row in results[fn]]
    else:
        # A validated non-empty DAG always has a final node, so this branch
        # is only reached for an empty spec: pass the raw rows through
        # instead of indexing into an empty execution order.
        final_output = list(raw_rows)

    dag_structure = {
        n: {
            'depends_on': pipeline_spec[n].get('depends_on', []),
            'func': pipeline_spec[n].get('func'),
            'params': pipeline_spec[n].get('params', {}),
        }
        for n in pipeline_spec
    }

    return final_output, metrics_per_step, dag_structure
|
|
251
|
+
|
|
252
|
+
# -------------------------
|
|
253
|
+
# Serialization helpers for canonical bytes (for output hash)
|
|
254
|
+
# -------------------------
|
|
255
|
+
def canonical_preprocessed_bytes_from_rows(rows: DataRows) -> bytes:
    """Serialize rows to canonical UTF-8 TSV bytes.

    The output has a ``label``, ``text``, ``n_tokens`` header followed by one
    line per row, tab-delimited, LF-terminated, minimally quoted.
    ``n_tokens`` is the whitespace-token count of the text.
    """
    import csv
    import io

    buffer = io.StringIO()
    tsv = csv.writer(buffer, delimiter='\t', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
    records = [['label', 'text', 'n_tokens']]
    records.extend(
        [label, text, str(len(text.split()) if text else 0)]
        for label, text in rows
    )
    tsv.writerows(records)
    return buffer.getvalue().encode('utf-8')
|
|
265
|
+
|
|
266
|
+
def compute_output_hash_from_rows(rows: DataRows) -> str:
    """Return the SHA-256 hex digest of the rows' canonical TSV serialization."""
    digest = hashlib.sha256(canonical_preprocessed_bytes_from_rows(rows))
    return digest.hexdigest()
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import tempfile
|
|
6
|
+
import tarfile
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Dict, Union
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
from marco.core.preprocessor import run_pipeline_dag, compute_output_hash_from_rows, compute_metrics_for_rows
|
|
14
|
+
from marco.core.locker import FileLock
|
|
15
|
+
|
|
16
|
+
def init_repo(repo_path: Union[str, Path]):
    """Create the ``.marco`` layout under ``repo_path`` and return its path.

    Ensures ``.marco/versions`` and ``.marco/locks`` exist and writes an empty
    ``refs.json`` (``{}``) if there is none yet. An existing ``refs.json`` is
    left untouched, so calling this on an initialized repository is harmless.
    """
    marco_dir = Path(repo_path) / '.marco'
    for sub in ('versions', 'locks'):
        (marco_dir / sub).mkdir(parents=True, exist_ok=True)

    refs_file = marco_dir / 'refs.json'
    if refs_file.exists():
        return marco_dir
    refs_file.write_text('{}', encoding='utf-8')
    return marco_dir
|
|
25
|
+
|
|
26
|
+
def canonical_json(obj) -> str:
    """Serialize ``obj`` to compact JSON with sorted keys.

    Sorting keys and dropping whitespace makes the text independent of dict
    insertion order, so it can be hashed.
    """
    compact_separators = (',', ':')
    encoded = json.dumps(obj, sort_keys=True, separators=compact_separators)
    return encoded
|
|
28
|
+
|
|
29
|
+
def compute_raw_hash(raw_path: Path) -> str:
    """Return the SHA-256 hex digest of a file with CRLF normalized to LF.

    The file is streamed in 8 KiB chunks. A trailing ``\\r`` at the end of a
    chunk is held back and prepended to the next chunk, so a ``\\r\\n`` pair
    that straddles a chunk boundary is normalized exactly like one inside a
    chunk. Without that, the digest would depend on where the pair happens to
    fall relative to the read size. Lone ``\\r`` bytes are hashed unchanged.

    Args:
        raw_path: Path of the file to hash.

    Returns:
        Hex digest of the normalized byte stream.
    """
    hasher = hashlib.sha256()
    pending_cr = False
    with raw_path.open('rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            if pending_cr:
                chunk = b'\r' + chunk
            # Defer a trailing CR: it may be the first half of a CRLF pair.
            pending_cr = chunk.endswith(b'\r')
            if pending_cr:
                chunk = chunk[:-1]
            hasher.update(chunk.replace(b'\r\n', b'\n'))
    if pending_cr:
        # The file ended with a lone CR; it is part of the content.
        hasher.update(b'\r')
    return hasher.hexdigest()
|
|
35
|
+
|
|
36
|
+
def create_version(raw_path: Path, config: dict, repo_path: Path, user: str = "unknown", tags: List[str] = None, parents: List[str] = None):
    """Create (or reuse) an immutable dataset version and return its id.

    The version id is the SHA-256 of ``raw_hash + ':' + canonical_json(config)``,
    so the same raw data and config always map to the same version. If that
    version already exists, only the requested tags are applied.

    Args:
        raw_path: Raw text file; each non-blank line is ``label<TAB>text`` or
            just ``text`` (empty label).
        config: Preprocessing configuration; must contain a non-empty ``dag``.
        repo_path: Directory containing the ``.marco`` repository.
        user: Recorded as ``created_by`` in the manifest and metrics.
        tags: Optional tag names to point at the version.
        parents: Optional parent version ids recorded in the manifest.

    Returns:
        The version id (64-character hex string).

    Raises:
        RuntimeError: If the repository has not been initialized.
        ValueError: If ``config`` has no ``dag`` mapping.
    """
    tags = tags or []
    parents = parents or []
    # Accept str paths for raw_path too, the same way repo_path is handled.
    raw_path = Path(raw_path)
    repo_path = Path(repo_path)
    marco_dir = repo_path / '.marco'
    versions_dir = marco_dir / 'versions'
    locks_dir = marco_dir / 'locks'

    if not marco_dir.exists():
        raise RuntimeError(f"Marco repository not initialized at {repo_path}. Run 'marco init' first.")

    raw_hash = compute_raw_hash(raw_path)
    canonical = canonical_json(config)
    version_id = hashlib.sha256((raw_hash + ':' + canonical).encode('utf-8')).hexdigest()
    version_folder = versions_dir / version_id

    with FileLock(locks_dir, "create_version"):
        if version_folder.exists():
            print(f"Version {version_id} already exists.")
            if tags:
                tag_version(version_id, tags, repo_path)
            return version_id

        # Normalize every newline flavor, then drop blank lines.
        text = raw_path.read_text(encoding='utf-8', errors='replace')
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        lines = [ln for ln in text.split('\n') if ln.strip()]

        raw_rows = []
        for ln in lines:
            if '\t' in ln:
                lbl, txt = ln.split('\t', 1)
            else:
                lbl, txt = '', ln
            raw_rows.append((lbl.strip(), txt.strip()))

        pipeline_spec = config.get('dag') or {}
        if not pipeline_spec:
            raise ValueError("Config must include 'dag' mapping of nodes")

        final_rows, metrics_per_step, dag_structure = run_pipeline_dag(raw_rows, pipeline_spec)

        final_metrics = compute_metrics_for_rows(final_rows)
        output_hash = compute_output_hash_from_rows(final_rows)
        final_metrics['output_hash'] = output_hash
        final_metrics['created_by'] = user

        df = pd.DataFrame([
            {'label': lbl, 'text': txt, 'n_tokens': len(txt.split()) if txt else 0}
            for lbl, txt in final_rows
        ])

        manifest = {
            'version_id': version_id,
            'created_at': datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            'created_by': user,
            'raw_hash': raw_hash,
            'config_hash': hashlib.sha256(canonical.encode('utf-8')).hexdigest(),
            'n_raw_lines': len(lines),
            'parents': parents,
            'output_hash': output_hash,
            'dag': dag_structure
        }

        # Stage everything in a sibling temp dir, then publish it with a single
        # rename so readers never observe a half-written version folder.
        tmp = Path(tempfile.mkdtemp(prefix='version_tmp_', dir=versions_dir))
        try:
            shutil.copy2(raw_path, tmp / 'raw.txt')
            (tmp / 'config.json').write_text(canonical, encoding='utf-8')
            (tmp / 'raw_hash.txt').write_text(raw_hash, encoding='utf-8')
            df.to_csv(tmp / 'preprocessed.tsv', sep='\t', index=False, header=True, encoding='utf-8', lineterminator='\n')
            final_metrics['metrics_per_step'] = metrics_per_step
            (tmp / 'metrics.json').write_text(json.dumps(final_metrics, indent=2), encoding='utf-8')
            (tmp / 'dag_structure.json').write_text(json.dumps(dag_structure, indent=2), encoding='utf-8')
            (tmp / 'manifest.json').write_text(json.dumps(manifest, indent=2), encoding='utf-8')

            os.replace(str(tmp), str(version_folder))
        except BaseException:
            # BaseException so Ctrl-C / SystemExit do not leave a stale
            # version_tmp_* directory inside the versions store.
            shutil.rmtree(tmp, ignore_errors=True)
            raise

        if tags:
            tag_version(version_id, tags, repo_path)

    return version_id
|
|
116
|
+
|
|
117
|
+
def tag_version(version_id: str, tags: List[str], repo_path: Path):
    """Point each tag in ``tags`` at ``version_id`` in ``.marco/refs.json``.

    Existing tags with the same name are overwritten. The read-modify-write is
    serialized by the ``refs_update`` file lock. The new content is written to
    a sibling temp file and swapped in with ``os.replace``, so readers that do
    not take the lock never see a truncated ``refs.json`` and an interrupted
    write cannot destroy the existing tags.

    Args:
        version_id: Version the tags should reference.
        tags: Tag names to create or move.
        repo_path: Directory containing the ``.marco`` repository.
    """
    repo_path = Path(repo_path)
    refs_file = repo_path / '.marco' / 'refs.json'
    locks_dir = repo_path / '.marco' / 'locks'

    with FileLock(locks_dir, "refs_update"):
        refs = {}
        if refs_file.exists():
            refs = json.loads(refs_file.read_text(encoding='utf-8'))
        for t in tags:
            refs[t] = version_id

        # A fixed temp name is safe here because the lock admits one writer.
        tmp_file = refs_file.with_name(refs_file.name + '.tmp')
        tmp_file.write_text(json.dumps(refs, indent=2), encoding='utf-8')
        os.replace(str(tmp_file), str(refs_file))
|
|
129
|
+
|
|
130
|
+
def list_tags(repo_path: Path) -> Dict[str, str]:
    """Return the tag -> version id mapping stored in ``.marco/refs.json``.

    Returns an empty dict when the refs file does not exist.
    """
    refs_file = Path(repo_path) / '.marco' / 'refs.json'
    if not refs_file.exists():
        return {}
    raw = refs_file.read_text(encoding='utf-8')
    return json.loads(raw)
|
|
135
|
+
|
|
136
|
+
def resolve_version(ref: str, repo_path: Path) -> str:
    """Resolve a full id, tag name, or unique id prefix to a full version id.

    Resolution order: exact version directory name, then tag name, then a
    unique prefix of a version directory name.

    ``ref`` is only treated as an exact id when it is a single, ordinary path
    component naming a directory. Values such as ``''``, ``'.'``, ``'..'`` or
    ``'a/b'`` would otherwise pass an existence check and be returned as a
    "version id" that points outside the versions store. An empty ``ref`` is
    also never used as a prefix, because it would match every version.

    Args:
        ref: Version id, tag, or id prefix.
        repo_path: Directory containing the ``.marco`` repository.

    Returns:
        The full version id.

    Raises:
        ValueError: If the prefix is ambiguous or nothing matches.
    """
    repo_path = Path(repo_path)
    versions_dir = repo_path / '.marco' / 'versions'

    is_plain_name = ref not in ('', '.', '..') and Path(ref).name == ref
    if is_plain_name and (versions_dir / ref).is_dir():
        return ref

    tags = list_tags(repo_path)
    if ref in tags:
        return tags[ref]

    # No versions directory simply means there is nothing to match against.
    if ref and versions_dir.is_dir():
        matches = [d.name for d in versions_dir.iterdir() if d.is_dir() and d.name.startswith(ref)]
        if len(matches) == 1:
            return matches[0]
        elif len(matches) > 1:
            raise ValueError(f"Ambiguous short hash '{ref}'. Matches: {matches}")

    raise ValueError(f"Version '{ref}' not found.")
|
|
154
|
+
|
|
155
|
+
def list_versions(repo_path: Path) -> List[dict]:
    """Return the manifests of all stored versions, newest first.

    Each manifest dict gets a ``tags`` key listing the tag names that point at
    that version. Directories without a ``manifest.json`` are ignored. Returns
    an empty list when the versions directory does not exist.
    """
    versions_dir = Path(repo_path) / '.marco' / 'versions'
    if not versions_dir.exists():
        return []

    # Invert tag -> version into version -> [tags].
    tags_by_version = {}
    for tag, vid in list_tags(repo_path).items():
        tags_by_version.setdefault(vid, []).append(tag)

    manifests = []
    for entry in versions_dir.iterdir():
        manifest_file = entry / 'manifest.json'
        if not entry.is_dir() or not manifest_file.exists():
            continue
        manifest = json.loads(manifest_file.read_text(encoding='utf-8'))
        manifest['tags'] = tags_by_version.get(entry.name, [])
        manifests.append(manifest)

    return sorted(manifests, key=lambda m: m.get('created_at', ''), reverse=True)
|
|
173
|
+
|
|
174
|
+
def export_version(version_ref: str, dest_dir: Path, repo_path: Path):
    """Pack one version into ``<dest_dir>/marco_version_<id8>.tar.gz``.

    ``version_ref`` is resolved with ``resolve_version``. The destination
    directory is created if needed. Inside the archive the version is stored
    under a top-level directory named after the full version id. Returns the
    path of the archive.
    """
    vid = resolve_version(version_ref, repo_path)
    source = Path(repo_path) / '.marco' / 'versions' / vid

    target_dir = Path(dest_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    archive = target_dir / f"marco_version_{vid[:8]}.tar.gz"
    with tarfile.open(archive, "w:gz") as bundle:
        bundle.add(source, arcname=vid)
    return archive
|
|
184
|
+
|
|
185
|
+
def import_version(tar_path: Path, repo_path: Path):
    """Import a version archive produced by ``export_version``.

    The archive is untrusted input, so it is checked before and after
    extraction:

    * member names may not be absolute or contain ``..``;
    * only regular files and directories are accepted, because symlink,
      hardlink and device members can redirect later writes outside the
      extraction directory;
    * the ``'data'`` extraction filter is applied when this Python provides it;
    * the archive must contain exactly one top-level directory whose name is a
      64-character lowercase hex id and which holds a ``manifest.json``.

    Extraction happens in a temp directory inside the versions store, and the
    result is published with a single ``os.replace``. An interrupted import
    therefore cannot leave a partial version folder that later looks like
    "already exists".

    Args:
        tar_path: Path of the ``.tar.gz`` archive.
        repo_path: Directory containing the ``.marco`` repository.

    Returns:
        The imported (or already present) version id.

    Raises:
        RuntimeError: If the repository has not been initialized.
        ValueError: If the archive has illegal members or an invalid layout.
    """
    tar_path = Path(tar_path)
    repo_path = Path(repo_path)
    versions_dir = repo_path / '.marco' / 'versions'
    locks_dir = repo_path / '.marco' / 'locks'

    if not versions_dir.exists():
        raise RuntimeError("Marco repo not initialized.")

    with FileLock(locks_dir, "import_version"):
        # Same filesystem as the destination, so the final rename is atomic.
        with tempfile.TemporaryDirectory(prefix='import_tmp_', dir=str(versions_dir)) as tmpdir:
            with tarfile.open(tar_path, "r:gz") as tar:
                for member in tar.getmembers():
                    if member.name.startswith('/') or '..' in member.name:
                        raise ValueError("Tarball contains illegal paths.")
                    if not (member.isfile() or member.isdir()):
                        raise ValueError("Tarball contains illegal paths.")
                if hasattr(tarfile, 'data_filter'):
                    tar.extractall(path=tmpdir, filter='data')
                else:
                    # Older Python without extraction filters; the member
                    # checks above are the only protection here.
                    tar.extractall(path=tmpdir)

            extracted = list(Path(tmpdir).iterdir())
            if len(extracted) != 1 or not extracted[0].is_dir() or len(extracted[0].name) != 64:
                raise ValueError("Invalid marco tarball format.")

            vid = extracted[0].name
            is_hex_id = all(ch in '0123456789abcdef' for ch in vid)
            if not is_hex_id or not (extracted[0] / 'manifest.json').is_file():
                raise ValueError("Invalid marco tarball format.")

            dest_dir = versions_dir / vid
            if dest_dir.exists():
                print(f"Version {vid} already exists.")
                return vid

            os.replace(str(extracted[0]), str(dest_dir))
            return vid
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from flask import Flask, render_template, request, jsonify
|
|
4
|
+
from marco.core import repository
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
app = Flask(__name__)
|
|
8
|
+
REPO_PATH = Path.cwd()
|
|
9
|
+
|
|
10
|
+
@app.route('/')
def index():
    """Render the dashboard home page with the list of versions.

    Listing is best-effort: if the repository cannot be read, the page is
    still rendered with an empty list. The failure is logged so it is not
    mistaken for an empty repository.
    """
    try:
        versions = repository.list_versions(REPO_PATH)
    except Exception:
        app.logger.exception("Failed to list versions in %s", REPO_PATH)
        versions = []
    return render_template('index.html', versions=versions)
|
|
17
|
+
|
|
18
|
+
@app.route('/version/<vid>')
def version_detail(vid):
    """Render the detail page for one version.

    ``vid`` may be a full id, a tag, or a unique id prefix. Responds with 404
    when the reference cannot be resolved or the version's metadata files are
    missing, and with 500 when a metadata file is not valid JSON.
    """
    repo_path = REPO_PATH
    try:
        resolved_vid = repository.resolve_version(vid, repo_path)
    except (ValueError, FileNotFoundError):
        # FileNotFoundError: the versions directory itself is missing.
        return "Version not found", 404

    v_dir = repo_path / '.marco' / 'versions' / resolved_vid
    try:
        manifest = json.loads((v_dir / 'manifest.json').read_text(encoding='utf-8'))
        metrics = json.loads((v_dir / 'metrics.json').read_text(encoding='utf-8'))
        config = json.loads((v_dir / 'config.json').read_text(encoding='utf-8'))
    except FileNotFoundError:
        return "Version not found", 404
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        app.logger.exception("Corrupt metadata for version %s", resolved_vid)
        return "Version metadata is corrupt", 500

    tags = repository.list_tags(repo_path)
    v_tags = [t for t, v in tags.items() if v == resolved_vid]

    return render_template('version.html', manifest=manifest, metrics=metrics, config=config, tags=v_tags)
|
|
35
|
+
|
|
36
|
+
@app.route('/api/versions')
def api_versions():
    """Return the version manifests (with tags) of the repository as JSON."""
    manifests = repository.list_versions(REPO_PATH)
    return jsonify(manifests)
|
|
39
|
+
|
|
40
|
+
def main():
    """Parse command-line options and start the dashboard server.

    Options:
        --port   TCP port to listen on (default 5000).
        --repo   Path of the project containing ``.marco`` (default ``.``).
        --debug  Enable Flask debug mode (off by default).

    Debug mode turns on the Werkzeug interactive debugger, which lets anyone
    who can reach the server execute arbitrary Python. It is therefore opt-in
    rather than always on.
    """
    global REPO_PATH

    parser = argparse.ArgumentParser(description="Marco Web Dashboard")
    parser.add_argument("--port", type=int, default=5000)
    parser.add_argument("--repo", type=str, default=".")
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable Flask debug mode (interactive debugger and reloader); development only.",
    )
    args = parser.parse_args()

    REPO_PATH = Path(args.repo).resolve()

    app.run(port=args.port, debug=args.debug)
|
|
50
|
+
|
|
51
|
+
if __name__ == "__main__":
|
|
52
|
+
main()
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: marco-dvcs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A minimal dataset versioning system for text data with a focus on reproducibility.
|
|
5
|
+
Home-page: https://github.com/Team-Marco-ACM/marco-package
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Author-email: your.email@example.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: pandas
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: Flask
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: author-email
|
|
18
|
+
Dynamic: classifier
|
|
19
|
+
Dynamic: description
|
|
20
|
+
Dynamic: description-content-type
|
|
21
|
+
Dynamic: home-page
|
|
22
|
+
Dynamic: requires-dist
|
|
23
|
+
Dynamic: requires-python
|
|
24
|
+
Dynamic: summary
|
|
25
|
+
|
|
26
|
+
# Marco Dataset Versioning System
|
|
27
|
+
|
|
28
|
+
A minimal dataset versioning system for text data with a strong focus on reproducibility and transparency. Treat your text datasets like code — immutable, versioned, reproducible, and explainable.
|
|
29
|
+
|
|
30
|
+
Marco acts as a lightweight Python library, meaning you can initialize it in *any* machine learning project folder to safely version and preprocess your datasets without altering your original files.
|
|
31
|
+
|
|
32
|
+
## 🚀 Installation (Linux / macOS)
|
|
33
|
+
|
|
34
|
+
On modern Linux environments (like Arch Linux, Ubuntu 23.04+), Python packages must be installed in a Virtual Environment (PEP 668) to prevent conflicts with your system packages.
|
|
35
|
+
|
|
36
|
+
Follow these steps to safely install Marco into your ML project:
|
|
37
|
+
|
|
38
|
+
1. **Clone this repository** to your local machine:
|
|
39
|
+
```bash
|
|
40
|
+
git clone https://github.com/Team-Marco-ACM/marco-package.git
|
|
41
|
+
cd marco-package
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
2. **Navigate to the ML project folder** where you want to train your model (e.g. your bag-of-words project):
|
|
45
|
+
```bash
|
|
46
|
+
cd ~/projects/my-bag-of-words-model
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
3. **Create and activate a Python Virtual Environment**:
|
|
50
|
+
```bash
|
|
51
|
+
# Create a virtual environment named 'venv'
|
|
52
|
+
python3 -m venv venv
|
|
53
|
+
|
|
54
|
+
# Activate it (You must do this every time you open a new terminal in this folder)
|
|
55
|
+
source venv/bin/activate
|
|
56
|
+
```
|
|
57
|
+
*(You should now see `(venv)` at the start of your terminal prompt!)*
|
|
58
|
+
|
|
59
|
+
4. **Install Marco**:
|
|
60
|
+
```bash
|
|
61
|
+
# Point pip to the directory where you cloned the marco repository
|
|
62
|
+
pip install -e /path/to/marco
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 🛠️ Usage Guide
|
|
68
|
+
|
|
69
|
+
Once `marco` is installed in your virtual environment, you have access to the full CLI!
|
|
70
|
+
|
|
71
|
+
### 1. Initialize a Repository
|
|
72
|
+
Initialize Marco tracking in your current directory. This creates a `.marco/` data versioning environment specific to that project.
|
|
73
|
+
```bash
|
|
74
|
+
marco init
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 2. Create an Immutable Version
|
|
78
|
+
Upload a text/CSV/TSV dataset to create an immutable version. Marco will compute a cryptographically secure SHA-256 hash using the raw data + the preprocessing configuration.
|
|
79
|
+
|
|
80
|
+
**Interactive Mode:**
|
|
81
|
+
If you don't supply a configuration file, Marco will interactively guide you through building the preprocessing pipeline (Lowercasing, Tokenization, Stopwords Removal, Deduplicating).
|
|
82
|
+
```bash
|
|
83
|
+
marco upload my_dataset.csv -t v1-raw
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Config Mode:**
|
|
87
|
+
```bash
|
|
88
|
+
marco upload my_dataset.csv -c my_config.json -t v1-processed
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### 3. List Versions
|
|
92
|
+
View all the versions you've created, along with their tags and timestamps.
|
|
93
|
+
```bash
|
|
94
|
+
marco list
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 4. Start the Visual Dashboard
|
|
98
|
+
Open the Flask-powered web dashboard to view dataset metadata, differences between versions, and complete data lineage trees.
|
|
99
|
+
```bash
|
|
100
|
+
python -m marco.web.app --port 5000
|
|
101
|
+
```
|
|
102
|
+
Open `http://localhost:5000` in your web browser.
|
|
103
|
+
|
|
104
|
+
### 5. Export / Import Versions
|
|
105
|
+
Easily share dataset versions with teammates by packing them into `.tar.gz` files.
|
|
106
|
+
```bash
|
|
107
|
+
# Export version 'v1-raw' to the 'exports' folder
|
|
108
|
+
marco export v1-raw ./exports/
|
|
109
|
+
|
|
110
|
+
# Import an archive sent to you by a coworker
|
|
111
|
+
marco import ./exports/marco_version_e5e0b767.tar.gz
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 🧠 Architecture Overview
|
|
117
|
+
|
|
118
|
+
Marco decouples logic from the file system. All core engine operations sit inside `marco/core/`, including:
|
|
119
|
+
- `locker.py`: File-based concurrency control using `.lock` files.
|
|
120
|
+
- `repository.py`: CRUD operations for dataset versions and `refs.json` tagging.
|
|
121
|
+
- `preprocessor.py`: A robust Directed Acyclic Graph (DAG) preprocessing engine.
|
|
122
|
+
|
|
123
|
+
Have fun building safer machine learning pipelines!
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
setup.py
|
|
4
|
+
marco/__init__.py
|
|
5
|
+
marco/__main__.py
|
|
6
|
+
marco/cli/__init__.py
|
|
7
|
+
marco/cli/interactive.py
|
|
8
|
+
marco/cli/main.py
|
|
9
|
+
marco/cli/web_serve.py
|
|
10
|
+
marco/core/__init__.py
|
|
11
|
+
marco/core/locker.py
|
|
12
|
+
marco/core/preprocessor.py
|
|
13
|
+
marco/core/repository.py
|
|
14
|
+
marco/ui/__init__.py
|
|
15
|
+
marco/web/__init__.py
|
|
16
|
+
marco/web/app.py
|
|
17
|
+
marco_dvcs.egg-info/PKG-INFO
|
|
18
|
+
marco_dvcs.egg-info/SOURCES.txt
|
|
19
|
+
marco_dvcs.egg-info/dependency_links.txt
|
|
20
|
+
marco_dvcs.egg-info/entry_points.txt
|
|
21
|
+
marco_dvcs.egg-info/requires.txt
|
|
22
|
+
marco_dvcs.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
marco
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from setuptools import setup, find_packages
|
|
3
|
+
|
|
4
|
+
# Use the README as the long description shown on the package index.
this_directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="marco-dvcs",  # Changed to 'marco-dvcs' as 'marco' might be taken on PyPI
    version="0.1.0",
    # TODO: replace the placeholder author metadata before the next release.
    author="Your Name",
    author_email="your.email@example.com",
    description="A minimal dataset versioning system for text data with a focus on reproducibility.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Team-Marco-ACM/marco-package",
    packages=find_packages(),
    # The web dashboard renders 'index.html' and 'version.html', but
    # find_packages() only collects Python modules, so template and static
    # files have to be declared to be shipped.
    # NOTE(review): assumes they live under marco/web/templates and
    # marco/web/static (Flask's defaults for Flask(__name__)) - confirm.
    package_data={
        "marco.web": ["templates/*.html", "static/*"],
    },
    install_requires=[
        "pandas",
        "numpy",
        "Flask",
    ],
    entry_points={
        "console_scripts": [
            "marco=marco.cli.main:main",
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.8",
)
|