napsack 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napsack-0.1.0/PKG-INFO +105 -0
- napsack-0.1.0/README.md +76 -0
- napsack-0.1.0/pyproject.toml +52 -0
- napsack-0.1.0/setup.cfg +4 -0
- napsack-0.1.0/src/label/__init__.py +0 -0
- napsack-0.1.0/src/label/__main__.py +210 -0
- napsack-0.1.0/src/label/caption_matching.py +150 -0
- napsack-0.1.0/src/label/clients/__init__.py +28 -0
- napsack-0.1.0/src/label/clients/bigquery.py +171 -0
- napsack-0.1.0/src/label/clients/client.py +28 -0
- napsack-0.1.0/src/label/clients/gemini.py +84 -0
- napsack-0.1.0/src/label/clients/vllm.py +137 -0
- napsack-0.1.0/src/label/discovery.py +116 -0
- napsack-0.1.0/src/label/models.py +533 -0
- napsack-0.1.0/src/label/processor.py +642 -0
- napsack-0.1.0/src/label/prompts/default.txt +98 -0
- napsack-0.1.0/src/label/prompts/screenshots_only.txt +87 -0
- napsack-0.1.0/src/label/video.py +323 -0
- napsack-0.1.0/src/label/visualizer.py +280 -0
- napsack-0.1.0/src/napsack/__init__.py +2 -0
- napsack-0.1.0/src/napsack.egg-info/PKG-INFO +105 -0
- napsack-0.1.0/src/napsack.egg-info/SOURCES.txt +46 -0
- napsack-0.1.0/src/napsack.egg-info/dependency_links.txt +1 -0
- napsack-0.1.0/src/napsack.egg-info/entry_points.txt +3 -0
- napsack-0.1.0/src/napsack.egg-info/requires.txt +22 -0
- napsack-0.1.0/src/napsack.egg-info/top_level.txt +3 -0
- napsack-0.1.0/src/record/__init__.py +0 -0
- napsack-0.1.0/src/record/__main__.py +413 -0
- napsack-0.1.0/src/record/constants.py +112 -0
- napsack-0.1.0/src/record/handlers/__init__.py +7 -0
- napsack-0.1.0/src/record/handlers/accessibility.py +227 -0
- napsack-0.1.0/src/record/handlers/input_event.py +269 -0
- napsack-0.1.0/src/record/handlers/screenshot.py +87 -0
- napsack-0.1.0/src/record/models/__init__.py +16 -0
- napsack-0.1.0/src/record/models/aggregation.py +51 -0
- napsack-0.1.0/src/record/models/event.py +35 -0
- napsack-0.1.0/src/record/models/event_queue.py +503 -0
- napsack-0.1.0/src/record/models/image.py +23 -0
- napsack-0.1.0/src/record/models/image_queue.py +118 -0
- napsack-0.1.0/src/record/monitor/__init__.py +9 -0
- napsack-0.1.0/src/record/monitor/reader.py +101 -0
- napsack-0.1.0/src/record/monitor/summary.py +402 -0
- napsack-0.1.0/src/record/monitor/viewer.py +393 -0
- napsack-0.1.0/src/record/sanitize.py +224 -0
- napsack-0.1.0/src/record/workers/__init__.py +10 -0
- napsack-0.1.0/src/record/workers/aggregation.py +157 -0
- napsack-0.1.0/src/record/workers/save.py +104 -0
- napsack-0.1.0/src/record/workers/screenshot.py +136 -0
napsack-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: napsack
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened.
|
|
5
|
+
Requires-Python: <=3.13,>=3.11
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: mss<11.0.0,>=10.0.0
|
|
8
|
+
Requires-Dist: numpy==2.2
|
|
9
|
+
Requires-Dist: opencv-python<5.0.0.0,>=4.11.0.86
|
|
10
|
+
Requires-Dist: pandas<3.0.0,>=2.3.0
|
|
11
|
+
Requires-Dist: matplotlib<4.0.0,>=3.10.3
|
|
12
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.7.0
|
|
13
|
+
Requires-Dist: ruptures<2.0.0,>=1.1.9
|
|
14
|
+
Requires-Dist: ipdb<0.14.0,>=0.13.13
|
|
15
|
+
Requires-Dist: plotly<7.0.0,>=6.2.0
|
|
16
|
+
Requires-Dist: nbformat<6.0.0,>=5.10.4
|
|
17
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
18
|
+
Requires-Dist: google-generativeai>=0.8.5
|
|
19
|
+
Requires-Dist: datasets<4.0.0,>=3.0.0
|
|
20
|
+
Requires-Dist: pillow>=11.3.0
|
|
21
|
+
Requires-Dist: imageio>=2.37.0
|
|
22
|
+
Requires-Dist: screeninfo>=0.8.1
|
|
23
|
+
Requires-Dist: pynput>=1.8.1
|
|
24
|
+
Requires-Dist: scikit-image>=0.25.2
|
|
25
|
+
Requires-Dist: google-genai>=1.45.0
|
|
26
|
+
Requires-Dist: openai>=2.8.1
|
|
27
|
+
Requires-Dist: google-cloud-storage>=3.6.0
|
|
28
|
+
Requires-Dist: google-cloud-bigquery>=3.38.0
|
|
29
|
+
|
|
30
|
+
# NAPsack
|
|
31
|
+
|
|
32
|
+
**NAPsack** records and structures your computer use by generating natural language captions from screenshots and input events (click, keypress, scroll, cursor move).
|
|
33
|
+
|
|
34
|
+
<img alt="napsack_overview" src="https://github.com/user-attachments/assets/dd9ca2c5-288c-4977-8dc9-10ca343e56db" />
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
# Quickstart
|
|
38
|
+
|
|
39
|
+
> Requires Python 3.11+ and `ffmpeg` for video generation. Use `uv` to run the commands below.
|
|
40
|
+
|
|
41
|
+
## API Keys
|
|
42
|
+
|
|
43
|
+
NAPsack uses a VLM to generate captions. Create a `.env` file in the project root (or export variables in your shell):
|
|
44
|
+
|
|
45
|
+
```shell
|
|
46
|
+
cp .env.example .env
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Then fill in the key for your chosen client:
|
|
50
|
+
|
|
51
|
+
| Client | Variable | Where to get it |
|
|
52
|
+
|--------|----------|-----------------|
|
|
53
|
+
| `gemini` (default) | `GEMINI_API_KEY` | [Google AI Studio](https://aistudio.google.com/apikey) |
|
|
54
|
+
| `vllm` | _(none — pass `--vllm-url`)_ | Self-hosted vLLM server |
|
|
55
|
+
| `bigquery` | _(uses Application Default Credentials)_ | `gcloud auth application-default login` |
|
|
56
|
+
|
|
57
|
+
For Gemini, your `.env` should contain:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
GEMINI_API_KEY=your_key_here
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Record** a session (press CTRL+C to stop)
|
|
64
|
+
```shell
|
|
65
|
+
uv run -m record --monitor
|
|
66
|
+
```
|
|
67
|
+
**Label** the recorded session
|
|
68
|
+
```shell
|
|
69
|
+
uv run -m label --session logs/session_name --client gemini
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
> NAPsack supports `gemini` and `vllm` for data labeling and integrates with `bigquery`.
|
|
73
|
+
|
|
74
|
+
# Output
|
|
75
|
+
|
|
76
|
+
```shell
|
|
77
|
+
logs/session_name
|
|
78
|
+
├── screenshots # Recorded screenshots
|
|
79
|
+
├── aggregations.jsonl # Recorded event bursts
|
|
80
|
+
├── captions.jsonl # All VLM-generated captions
|
|
81
|
+
├── annotated.mp4 # Final video showing generated captions and input events
|
|
82
|
+
└── data.jsonl # Final data containing raw input events and LLM generated captions
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
# Method
|
|
86
|
+
|
|
87
|
+
## Record
|
|
88
|
+
|
|
89
|
+
NAPsack groups temporally adjacent input events of the same type into **event bursts**. An event is assigned to the current burst if the time since the preceding event of that type does not exceed the corresponding **gap** threshold and the elapsed time since the burst start remains within the **max** duration.
|
|
90
|
+
* If the **gap** threshold is exceeded, a new burst is started.
|
|
91
|
+
* If the **max** duration is exceeded, the first half of the current burst is finalized and saved, while the second half becomes the active burst.
|
|
92
|
+
A burst is force-restarted when the active monitor changes.
|
|
93
|
+
|
|
94
|
+
## Label
|
|
95
|
+
|
|
96
|
+
The `label` module:
|
|
97
|
+
|
|
98
|
+
* Loads sessions or raw screenshots and chunks.
|
|
99
|
+
* Uses prompts (in `label/prompts`) to instruct the VLM to generate captions that describe the user's actions and context.
|
|
100
|
+
* Produces `captions.jsonl` and `data.jsonl` (captions aligned to screenshots and events).
|
|
101
|
+
* Optionally renders an annotated video (`annotated.mp4`) showing captions and event visualizations overlayed on frames.
|
|
102
|
+
|
|
103
|
+
The label step performs a second layer of aggregation: it uses the bursts detected at recording time and further refines and annotates them with VLM outputs to create final human-readable summaries.
|
|
104
|
+
|
|
105
|
+
|
napsack-0.1.0/README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# NAPsack
|
|
2
|
+
|
|
3
|
+
**NAPsack** records and structures your computer use by generating natural language captions from screenshots and input events (click, keypress, scroll, cursor move).
|
|
4
|
+
|
|
5
|
+
<img alt="napsack_overview" src="https://github.com/user-attachments/assets/dd9ca2c5-288c-4977-8dc9-10ca343e56db" />
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
# Quickstart
|
|
9
|
+
|
|
10
|
+
> Requires Python 3.11+ and `ffmpeg` for video generation. Use `uv` to run the commands below.
|
|
11
|
+
|
|
12
|
+
## API Keys
|
|
13
|
+
|
|
14
|
+
NAPsack uses a VLM to generate captions. Create a `.env` file in the project root (or export variables in your shell):
|
|
15
|
+
|
|
16
|
+
```shell
|
|
17
|
+
cp .env.example .env
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Then fill in the key for your chosen client:
|
|
21
|
+
|
|
22
|
+
| Client | Variable | Where to get it |
|
|
23
|
+
|--------|----------|-----------------|
|
|
24
|
+
| `gemini` (default) | `GEMINI_API_KEY` | [Google AI Studio](https://aistudio.google.com/apikey) |
|
|
25
|
+
| `vllm` | _(none — pass `--vllm-url`)_ | Self-hosted vLLM server |
|
|
26
|
+
| `bigquery` | _(uses Application Default Credentials)_ | `gcloud auth application-default login` |
|
|
27
|
+
|
|
28
|
+
For Gemini, your `.env` should contain:
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
GEMINI_API_KEY=your_key_here
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Record** a session (press CTRL+C to stop)
|
|
35
|
+
```shell
|
|
36
|
+
uv run -m record --monitor
|
|
37
|
+
```
|
|
38
|
+
**Label** the recorded session
|
|
39
|
+
```shell
|
|
40
|
+
uv run -m label --session logs/session_name --client gemini
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
> NAPsack supports `gemini` and `vllm` for data labeling and integrates with `bigquery`.
|
|
44
|
+
|
|
45
|
+
# Output
|
|
46
|
+
|
|
47
|
+
```shell
|
|
48
|
+
logs/session_name
|
|
49
|
+
├── screenshots # Recorded screenshots
|
|
50
|
+
├── aggregations.jsonl # Recorded event bursts
|
|
51
|
+
├── captions.jsonl # All VLM-generated captions
|
|
52
|
+
├── annotated.mp4 # Final video showing generated captions and input events
|
|
53
|
+
└── data.jsonl # Final data containing raw input events and LLM generated captions
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
# Method
|
|
57
|
+
|
|
58
|
+
## Record
|
|
59
|
+
|
|
60
|
+
NAPsack groups temporally adjacent input events of the same type into **event bursts**. An event is assigned to the current burst if the time since the preceding event of that type does not exceed the corresponding **gap** threshold and the elapsed time since the burst start remains within the **max** duration.
|
|
61
|
+
* If the **gap** threshold is exceeded, a new burst is started.
|
|
62
|
+
* If the **max** duration is exceeded, the first half of the current burst is finalized and saved, while the second half becomes the active burst.
|
|
63
|
+
A burst is force-restarted when the active monitor changes.
|
|
64
|
+
|
|
65
|
+
## Label
|
|
66
|
+
|
|
67
|
+
The `label` module:
|
|
68
|
+
|
|
69
|
+
* Loads sessions or raw screenshots and chunks.
|
|
70
|
+
* Uses prompts (in `label/prompts`) to instruct the VLM to generate captions that describe the user's actions and context.
|
|
71
|
+
* Produces `captions.jsonl` and `data.jsonl` (captions aligned to screenshots and events).
|
|
72
|
+
* Optionally renders an annotated video (`annotated.mp4`) showing captions and event visualizations overlayed on frames.
|
|
73
|
+
|
|
74
|
+
The label step performs a second layer of aggregation: it uses the bursts detected at recording time and further refines and annotates them with VLM outputs to create final human-readable summaries.
|
|
75
|
+
|
|
76
|
+
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "napsack"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
description = "NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened."
|
|
10
|
+
requires-python = ">=3.11,<3.14"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"mss (>=10.0.0,<11.0.0)",
|
|
13
|
+
"numpy==2.2",
|
|
14
|
+
"opencv-python (>=4.11.0.86,<5.0.0.0)",
|
|
15
|
+
"pandas (>=2.3.0,<3.0.0)",
|
|
16
|
+
"matplotlib (>=3.10.3,<4.0.0)",
|
|
17
|
+
"scikit-learn (>=1.7.0,<2.0.0)",
|
|
18
|
+
"ruptures (>=1.1.9,<2.0.0)",
|
|
19
|
+
"ipdb>=0.13.13,<0.14.0",
|
|
20
|
+
"plotly (>=6.2.0,<7.0.0)",
|
|
21
|
+
"nbformat (>=5.10.4,<6.0.0)",
|
|
22
|
+
"python-dotenv>=1.0.0",
|
|
23
|
+
"google-generativeai>=0.8.5",
|
|
24
|
+
"datasets>=3.0.0,<4.0.0",
|
|
25
|
+
"pillow>=11.3.0",
|
|
26
|
+
"imageio>=2.37.0",
|
|
27
|
+
"screeninfo>=0.8.1",
|
|
28
|
+
"pynput>=1.8.1",
|
|
29
|
+
"scikit-image>=0.25.2",
|
|
30
|
+
"google-genai>=1.45.0",
|
|
31
|
+
"openai>=2.8.1",
|
|
32
|
+
"google-cloud-storage>=3.6.0",
|
|
33
|
+
"google-cloud-bigquery>=3.38.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[tool.uv]
|
|
37
|
+
package = true
|
|
38
|
+
|
|
39
|
+
[tool.setuptools]
|
|
40
|
+
package-dir = {"" = "src"}
|
|
41
|
+
include-package-data = true
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.package-data]
|
|
44
|
+
label = ["prompts/*.txt"]
|
|
45
|
+
|
|
46
|
+
[project.scripts]
|
|
47
|
+
napsack-record = "record.__main__:main"
|
|
48
|
+
napsack-label = "label.__main__:main"
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.packages.find]
|
|
51
|
+
where = ["src"]
|
|
52
|
+
include = ["label*", "record*", "napsack*"]
|
napsack-0.1.0/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import argparse
|
|
3
|
+
from dotenv import load_dotenv
|
|
4
|
+
|
|
5
|
+
from label.discovery import discover_sessions, discover_screenshots_sessions, create_single_config
|
|
6
|
+
from label.clients import create_client
|
|
7
|
+
from label.processor import Processor
|
|
8
|
+
from label.visualizer import Visualizer
|
|
9
|
+
|
|
10
|
+
load_dotenv()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_args():
    """Parse CLI arguments for the labeling pipeline.

    Also resolves derived defaults after parsing: a per-client default model
    when --model is not given, and a prompt file matching --screenshots-only.

    Returns:
        argparse.Namespace with all options populated.
    """
    p = argparse.ArgumentParser(description="Process session recordings with VLM")

    # Exactly one of --session / --sessions-root must be given.
    session_group = p.add_mutually_exclusive_group(required=True)
    session_group.add_argument("--session", type=Path)
    session_group.add_argument("--sessions-root", type=Path)

    p.add_argument("--chunk-duration", type=int, default=60, help="Chunk duration in seconds")
    p.add_argument("--fps", type=int, default=1, help="Frames per second for video processing")

    p.add_argument("--screenshots-only", action="store_true", help="Process screenshots folder only without aggregations or annotations")
    p.add_argument("--image-extensions", nargs="+", default=[".jpg", ".jpeg", ".png"], help="Image file extensions to consider")
    # BUG FIX: help text previously claimed "default: 120 = 2 minutes", but the
    # actual default is 300 seconds.
    p.add_argument("--max-time-gap", type=float, default=300.0, help="Maximum time gap (seconds) between images before forcing a video split (default: 300 = 5 minutes)")
    p.add_argument("--prompt-file", default=None, help="Path to prompt file (default: prompts/default.txt or prompts/screenshots_only.txt if screenshots only)")
    p.add_argument("--hash-cache", type=str, default=None, help="Path to hash_cache.json for deduplicating consecutive similar images")
    p.add_argument("--dedupe-threshold", type=int, default=1, help="Hamming distance threshold for deduplication (drop if <= threshold, default: 1)")
    p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
    p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
    p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
    p.add_argument("--encode-only", action="store_true", help="Only encode videos (create chunks), skip labeling. Useful for pre-processing before running the full pipeline.")

    p.add_argument("--client", choices=["gemini", "vllm", "bigquery"], default="gemini")
    p.add_argument("--model", default="")
    p.add_argument("--encode-workers", type=int, default=8, help="Number of parallel workers for video encoding")
    p.add_argument("--label-workers", type=int, default=4, help="Number of parallel workers for VLM labeling")

    vllm_group = p.add_argument_group("vLLM Options")
    vllm_group.add_argument("--vllm-url")

    bq_group = p.add_argument_group("BigQuery Options")
    bq_group.add_argument("--bq-project", help="GCP project ID for AI Platform endpoint")
    bq_group.add_argument("--bq-bucket-name", help="GCS bucket name for uploading videos")
    bq_group.add_argument("--bq-gcs-prefix", default="video_chunks", help="Prefix/folder path in GCS bucket")
    bq_group.add_argument("--bq-object-table-location", default="us", help="Object table location (e.g., 'us' or 'us.screenomics-gemini')")

    args = p.parse_args()

    # Resolve the default model per client when none was supplied.
    if not args.model:
        if args.client == 'gemini':
            args.model = 'gemini-3-flash-preview'
        elif args.client == 'vllm':
            args.model = 'Qwen/Qwen3-VL-8B-Thinking-FP8'
        elif args.client == 'bigquery':
            args.model = 'dataset.model'  # Placeholder - user must provide full model reference
    # The screenshots-only pipeline uses its own prompt.
    if not args.prompt_file:
        args.prompt_file = "prompts/screenshots_only.txt" if args.screenshots_only else "prompts/default.txt"

    return args
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def setup_configs(args):
    """Build the list of session configs to process.

    A single --session yields exactly one config; otherwise sessions are
    discovered under --sessions-root (screenshots-only discovery when that
    flag is set). Returns an empty list when nothing is found.
    """
    if args.session:
        return [create_single_config(
            args.session,
            args.chunk_duration,
            args.screenshots_only,
            tuple(args.image_extensions),
        )]

    if args.screenshots_only:
        configs = discover_screenshots_sessions(
            args.sessions_root,
            args.chunk_duration,
            tuple(args.image_extensions),
        )
    else:
        configs = discover_sessions(
            args.sessions_root,
            args.chunk_duration,
            args.skip_existing,
        )

    if not configs:
        print(f"No sessions found in {args.sessions_root}")
        return []

    return configs
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def process_with_gemini(args, configs):
    """Run the labeling pipeline for *configs* using the Gemini client."""
    # Processor options shared across all client types.
    processor_options = dict(
        encode_workers=args.encode_workers,
        label_workers=args.label_workers,
        screenshots_only=args.screenshots_only,
        prompt_file=args.prompt_file,
        max_time_gap=args.max_time_gap,
        hash_cache_path=args.hash_cache,
        dedupe_threshold=args.dedupe_threshold,
    )

    gemini_client = create_client('gemini', model_name=args.model)
    processor = Processor(client=gemini_client, **processor_options)

    # Annotation only applies to the standard (non screenshots-only) pipeline.
    should_annotate = args.annotate and not args.screenshots_only
    return processor.process_sessions(
        configs,
        fps=args.fps,
        annotate=should_annotate,
        encode_only=args.encode_only,
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def process_with_vllm(args, configs):
    """Run the labeling pipeline for *configs* against a self-hosted vLLM server.

    Raises:
        ValueError: if --vllm-url was not supplied; there is no sensible
            default for a self-hosted endpoint.
    """
    # BUG FIX: args.vllm_url defaults to None, so the .endswith() call below
    # used to raise an opaque AttributeError. Fail early with a clear message.
    if not args.vllm_url:
        raise ValueError("--vllm-url is required when using the vllm client")

    # Normalize the endpoint: the OpenAI-compatible API is served under /v1.
    base_url = args.vllm_url if args.vllm_url.endswith('/v1') else f"{args.vllm_url}/v1"
    client = create_client(
        'vllm',
        base_url=base_url,
        model_name=args.model
    )

    processor = Processor(
        client=client,
        encode_workers=args.encode_workers,
        label_workers=args.label_workers,
        screenshots_only=args.screenshots_only,
        prompt_file=args.prompt_file,
        max_time_gap=args.max_time_gap,
        hash_cache_path=args.hash_cache,
        dedupe_threshold=args.dedupe_threshold,
    )

    return processor.process_sessions(
        configs,
        fps=args.fps,
        # Annotation only applies to the standard (non screenshots-only) pipeline.
        annotate=args.annotate and not args.screenshots_only,
        encode_only=args.encode_only,
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def process_with_bigquery(args, configs):
    """Run the labeling pipeline for *configs* through BigQuery.

    Video chunks are uploaded to the configured GCS bucket and labeled via a
    BigQuery object table.
    """
    bq_client = create_client(
        'bigquery',
        model_name=args.model,
        bucket_name=args.bq_bucket_name,
        gcs_prefix=args.bq_gcs_prefix,
        object_table_location=args.bq_object_table_location,
        project_id=args.bq_project,
    )

    processor_options = {
        'encode_workers': args.encode_workers,
        'label_workers': args.label_workers,
        'screenshots_only': args.screenshots_only,
        'prompt_file': args.prompt_file,
        'max_time_gap': args.max_time_gap,
        'hash_cache_path': args.hash_cache,
        'dedupe_threshold': args.dedupe_threshold,
    }
    processor = Processor(client=bq_client, **processor_options)

    # Annotation only applies to the standard (non screenshots-only) pipeline.
    should_annotate = args.annotate and not args.screenshots_only
    return processor.process_sessions(
        configs,
        fps=args.fps,
        annotate=should_annotate,
        encode_only=args.encode_only,
    )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def main():
    """CLI entry point: discover sessions, label them with the chosen client,
    then optionally render annotated visualization videos."""
    args = parse_args()

    configs = setup_configs(args)
    if not configs:
        return

    print(f"Processing {len(configs)} sessions")

    # Dispatch to the client-specific pipeline runner.
    runners = {
        'gemini': process_with_gemini,
        'vllm': process_with_vllm,
        'bigquery': process_with_bigquery,
    }
    runner = runners.get(args.client)
    if runner is None:
        raise ValueError(f"Unknown client: {args.client}")
    results = runner(args, configs)

    print(f"✓ Processed {len(results)} sessions")

    if not args.visualize:
        return

    print("\nCreating visualizations...")
    visualizer = Visualizer(args.annotate)

    for config in configs:
        # Visualization needs the matched captions produced by processing.
        if not config.matched_captions_jsonl.exists():
            print(f"Skipping Visualizing {config.session_id}: no data.jsonl")
            continue

        try:
            output = config.session_folder / "annotated.mp4"
            visualizer.visualize(config.session_folder, output, args.fps)
            print(f"✓ {config.session_id}: {output}")
        except Exception as e:
            # Keep going: one failed session must not abort the rest.
            print(f"✗ {config.session_id}: {e}")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Script entry point for `python -m label` (also wired up as the
# `napsack-label` console script in pyproject.toml).
if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import List, Dict, Any, Optional
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSONL file into a list of dicts, skipping blank lines."""
    records: List[Dict[str, Any]] = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records


def match_captions_with_events(
    captions_path: Path,
    aggregations_path: Path,
    output_path: Path,
    fps: int = 1
) -> List[Dict[str, Any]]:
    """
    Match captions with aggregated events based on timestamps.

    Args:
        captions_path: Path to captions.jsonl
        aggregations_path: Path to aggregations.jsonl
        output_path: Path to save matched_captions.jsonl
        fps: Frames per second used in video creation

    Returns:
        List of matched caption-event objects
    """
    # FIX: deduplicated the two hand-rolled JSONL loaders into _load_jsonl,
    # and removed a stale "Convert MM:SS to seconds" comment (the caption
    # fields are already seconds).
    captions = _load_jsonl(captions_path)
    aggregations = _load_jsonl(aggregations_path)

    # Sort aggregations by timestamp so list index corresponds to frame order.
    aggregations.sort(key=lambda x: x.get('timestamp', 0))

    if not aggregations:
        print("[Matcher] Warning: No aggregations found")
        return []

    # First aggregation timestamp doubles as the video start time.
    first_timestamp = aggregations[0].get('timestamp', 0)

    print(f"[Matcher] Video start time: {first_timestamp}")
    print(f"[Matcher] Total aggregations: {len(aggregations)}")
    print(f"[Matcher] FPS: {fps}")

    matched_data = []

    for caption in captions:
        # Caption times are already seconds relative to the video start.
        start_seconds = caption['start_seconds']
        end_seconds = caption['end_seconds']

        # Each aggregation represents one frame, so index = seconds * fps.
        start_index = int(start_seconds * fps)
        end_index = int(end_seconds * fps)

        # Clamp to the valid aggregation range.
        start_index = max(0, min(start_index, len(aggregations) - 1))
        end_index = max(start_index, min(end_index, len(aggregations) - 1))

        print(f"[Matcher] Caption '{caption['caption'][:50]}...' -> indices [{start_index}, {end_index}]")

        matched_aggs = aggregations[start_index:end_index + 1]

        if not matched_aggs:
            # Defensive: after clamping this slice can't be empty, but keep
            # the caption anyway so no VLM output is ever dropped.
            matched_entry = {
                "start_time": first_timestamp + start_seconds,
                "end_time": first_timestamp + end_seconds,
                "start_index": start_index,
                "end_index": end_index,
                "img": None,
                "caption": caption['caption'],
                "raw_events": [],
                "num_aggregations": 0,
                "start_formatted": caption['start'],
                "end_formatted": caption['end'],
            }
        else:
            # First/last aggregation supply the wall-clock span and image.
            first_agg = matched_aggs[0]
            last_agg = matched_aggs[-1]

            # Concatenate all events from the matched aggregations.
            all_events = []
            for agg in matched_aggs:
                all_events.extend(agg.get('events', []))

            matched_entry = {
                "start_time": first_agg.get('timestamp'),
                "end_time": last_agg.get('timestamp'),
                "start_index": start_index,
                "end_index": end_index,
                "img": first_agg.get('screenshot_path'),
                "caption": caption['caption'],
                "raw_events": all_events,
                "num_aggregations": len(matched_aggs),
                "start_formatted": caption['start'],
                "end_formatted": caption['end'],
            }

        matched_data.append(matched_entry)

    # Save matched data as JSONL.
    with open(output_path, 'w', encoding='utf-8') as f:
        for entry in matched_data:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"[Matcher] Saved {len(matched_data)} matched entries to {output_path}")

    return matched_data
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def create_matched_captions_for_session(session_dir: Path, fps: int = 1) -> Optional[Path]:
    """
    Create matched_captions.jsonl for a session directory.

    Args:
        session_dir: Path to session directory
        fps: Frames per second used in video creation

    Returns:
        Path to the created matched_captions.jsonl, or None when either
        captions.jsonl or aggregations.jsonl is missing.
    """
    required = [
        session_dir / "captions.jsonl",
        session_dir / "aggregations.jsonl",
    ]
    for required_path in required:
        if not required_path.exists():
            print(f"[Matcher] Warning: {required_path} not found")
            return None

    output_path = session_dir / "matched_captions.jsonl"
    match_captions_with_events(required[0], required[1], output_path, fps)
    return output_path
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from label.clients.client import VLMClient, CAPTION_SCHEMA
|
|
2
|
+
from label.clients.gemini import GeminiClient, GeminiResponse
|
|
3
|
+
from label.clients.vllm import VLLMClient, VLLMResponse
|
|
4
|
+
from label.clients.bigquery import BigQueryClient, BigQueryResponse
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_client(client_type: str, **kwargs) -> VLMClient:
    """Instantiate the VLM client implementation named by *client_type*.

    Supported types: 'gemini', 'vllm', 'bigquery'. All keyword arguments are
    forwarded to the chosen client's constructor.

    Raises:
        ValueError: for an unrecognized *client_type*.
    """
    # Lazy factories: only the selected client class is ever constructed.
    factories = {
        'gemini': lambda: GeminiClient(**kwargs),
        'vllm': lambda: VLLMClient(**kwargs),
        'bigquery': lambda: BigQueryClient(**kwargs),
    }
    factory = factories.get(client_type)
    if factory is None:
        raise ValueError(f"Unknown client type: {client_type}")
    return factory()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Public API of label.clients: the abstract client interface, the concrete
# client implementations with their response types, the shared caption
# schema, and the create_client factory.
__all__ = [
    "VLMClient",
    "GeminiClient",
    "GeminiResponse",
    "VLLMClient",
    "VLLMResponse",
    "BigQueryClient",
    "BigQueryResponse",
    "CAPTION_SCHEMA",
    "create_client",
]
|