napsack 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. napsack-0.1.0/PKG-INFO +105 -0
  2. napsack-0.1.0/README.md +76 -0
  3. napsack-0.1.0/pyproject.toml +52 -0
  4. napsack-0.1.0/setup.cfg +4 -0
  5. napsack-0.1.0/src/label/__init__.py +0 -0
  6. napsack-0.1.0/src/label/__main__.py +210 -0
  7. napsack-0.1.0/src/label/caption_matching.py +150 -0
  8. napsack-0.1.0/src/label/clients/__init__.py +28 -0
  9. napsack-0.1.0/src/label/clients/bigquery.py +171 -0
  10. napsack-0.1.0/src/label/clients/client.py +28 -0
  11. napsack-0.1.0/src/label/clients/gemini.py +84 -0
  12. napsack-0.1.0/src/label/clients/vllm.py +137 -0
  13. napsack-0.1.0/src/label/discovery.py +116 -0
  14. napsack-0.1.0/src/label/models.py +533 -0
  15. napsack-0.1.0/src/label/processor.py +642 -0
  16. napsack-0.1.0/src/label/prompts/default.txt +98 -0
  17. napsack-0.1.0/src/label/prompts/screenshots_only.txt +87 -0
  18. napsack-0.1.0/src/label/video.py +323 -0
  19. napsack-0.1.0/src/label/visualizer.py +280 -0
  20. napsack-0.1.0/src/napsack/__init__.py +2 -0
  21. napsack-0.1.0/src/napsack.egg-info/PKG-INFO +105 -0
  22. napsack-0.1.0/src/napsack.egg-info/SOURCES.txt +46 -0
  23. napsack-0.1.0/src/napsack.egg-info/dependency_links.txt +1 -0
  24. napsack-0.1.0/src/napsack.egg-info/entry_points.txt +3 -0
  25. napsack-0.1.0/src/napsack.egg-info/requires.txt +22 -0
  26. napsack-0.1.0/src/napsack.egg-info/top_level.txt +3 -0
  27. napsack-0.1.0/src/record/__init__.py +0 -0
  28. napsack-0.1.0/src/record/__main__.py +413 -0
  29. napsack-0.1.0/src/record/constants.py +112 -0
  30. napsack-0.1.0/src/record/handlers/__init__.py +7 -0
  31. napsack-0.1.0/src/record/handlers/accessibility.py +227 -0
  32. napsack-0.1.0/src/record/handlers/input_event.py +269 -0
  33. napsack-0.1.0/src/record/handlers/screenshot.py +87 -0
  34. napsack-0.1.0/src/record/models/__init__.py +16 -0
  35. napsack-0.1.0/src/record/models/aggregation.py +51 -0
  36. napsack-0.1.0/src/record/models/event.py +35 -0
  37. napsack-0.1.0/src/record/models/event_queue.py +503 -0
  38. napsack-0.1.0/src/record/models/image.py +23 -0
  39. napsack-0.1.0/src/record/models/image_queue.py +118 -0
  40. napsack-0.1.0/src/record/monitor/__init__.py +9 -0
  41. napsack-0.1.0/src/record/monitor/reader.py +101 -0
  42. napsack-0.1.0/src/record/monitor/summary.py +402 -0
  43. napsack-0.1.0/src/record/monitor/viewer.py +393 -0
  44. napsack-0.1.0/src/record/sanitize.py +224 -0
  45. napsack-0.1.0/src/record/workers/__init__.py +10 -0
  46. napsack-0.1.0/src/record/workers/aggregation.py +157 -0
  47. napsack-0.1.0/src/record/workers/save.py +104 -0
  48. napsack-0.1.0/src/record/workers/screenshot.py +136 -0
napsack-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: napsack
3
+ Version: 0.1.0
4
+ Summary: NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened.
5
+ Requires-Python: <3.14,>=3.11
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: mss<11.0.0,>=10.0.0
8
+ Requires-Dist: numpy==2.2
9
+ Requires-Dist: opencv-python<5.0.0.0,>=4.11.0.86
10
+ Requires-Dist: pandas<3.0.0,>=2.3.0
11
+ Requires-Dist: matplotlib<4.0.0,>=3.10.3
12
+ Requires-Dist: scikit-learn<2.0.0,>=1.7.0
13
+ Requires-Dist: ruptures<2.0.0,>=1.1.9
14
+ Requires-Dist: ipdb<0.14.0,>=0.13.13
15
+ Requires-Dist: plotly<7.0.0,>=6.2.0
16
+ Requires-Dist: nbformat<6.0.0,>=5.10.4
17
+ Requires-Dist: python-dotenv>=1.0.0
18
+ Requires-Dist: google-generativeai>=0.8.5
19
+ Requires-Dist: datasets<4.0.0,>=3.0.0
20
+ Requires-Dist: pillow>=11.3.0
21
+ Requires-Dist: imageio>=2.37.0
22
+ Requires-Dist: screeninfo>=0.8.1
23
+ Requires-Dist: pynput>=1.8.1
24
+ Requires-Dist: scikit-image>=0.25.2
25
+ Requires-Dist: google-genai>=1.45.0
26
+ Requires-Dist: openai>=2.8.1
27
+ Requires-Dist: google-cloud-storage>=3.6.0
28
+ Requires-Dist: google-cloud-bigquery>=3.38.0
29
+
30
+ # NAPsack
31
+
32
+ **NAPsack** records and structures your computer use by generating natural language captions from screenshots and input events (click, keypress, scroll, cursor move).
33
+
34
+ <img alt="napsack_overview" src="https://github.com/user-attachments/assets/dd9ca2c5-288c-4977-8dc9-10ca343e56db" />
35
+
36
+ ---
37
+ # Quickstart
38
+
39
+ > Requires Python 3.11+ and `ffmpeg` for video generation. Use `uv` to run the commands below.
40
+
41
+ ## API Keys
42
+
43
+ NAPsack uses a VLM to generate captions. Create a `.env` file in the project root (or export variables in your shell):
44
+
45
+ ```shell
46
+ cp .env.example .env
47
+ ```
48
+
49
+ Then fill in the key for your chosen client:
50
+
51
+ | Client | Variable | Where to get it |
52
+ |--------|----------|-----------------|
53
+ | `gemini` (default) | `GEMINI_API_KEY` | [Google AI Studio](https://aistudio.google.com/apikey) |
54
+ | `vllm` | _(none — pass `--vllm-url`)_ | Self-hosted vLLM server |
55
+ | `bigquery` | _(uses Application Default Credentials)_ | `gcloud auth application-default login` |
56
+
57
+ For Gemini, your `.env` should contain:
58
+
59
+ ```
60
+ GEMINI_API_KEY=your_key_here
61
+ ```
62
+
63
+ **Record** a session (press CTRL+C to stop)
64
+ ```shell
65
+ uv run -m record --monitor
66
+ ```
67
+ **Label** the recorded session
68
+ ```shell
69
+ uv run -m label --session logs/session_name --client gemini
70
+ ```
71
+
72
+ > NAPsack supports `gemini` and `vllm` for data labeling and integrates with BigQuery
73
+
74
+ # Output
75
+
76
+ ```shell
77
+ logs/session_name
78
+ ├── screenshots # Recorded screenshots
79
+ ├── aggregations.jsonl # Recorded event bursts
80
+ ├── captions.jsonl # All VLM-generated captions
81
+ ├── annotated.mp4 # Final video showing generated captions and input events
82
+ └── data.jsonl # Final data containing raw input events and LLM generated captions
83
+ ```
84
+
85
+ # Method
86
+
87
+ ## Record
88
+
89
+ NAPsack groups temporally adjacent input events of the same type into **event bursts**. An event is assigned to the current burst if the time since the preceding event of that type does not exceed the corresponding **gap** threshold and the elapsed time since the burst start remains within the **max** duration.
90
+ * If the **gap** threshold is exceeded, a new burst is started.
91
+ * If the **max** duration is exceeded, the first half of the current burst is finalized and saved, while the second half becomes the active burst.
92
+ A burst is force-restarted when the active monitor changes.
93
+
94
+ ## Label
95
+
96
+ The `label` module:
97
+
98
+ * Loads sessions or raw screenshots and chunks.
99
+ * Uses prompts (in `label/prompts`) to instruct the VLM to generate captions that describe the user's actions and context.
100
+ * Produces `captions.jsonl` and `data.jsonl` (captions aligned to screenshots and events).
101
+ * Optionally renders an annotated video (`annotated.mp4`) showing captions and event visualizations overlayed on frames.
102
+
103
+ The label step performs a second layer of aggregation: it uses the bursts detected at recording time and further refines and annotates them with VLM outputs to create final human-readable summaries.
104
+
105
+
@@ -0,0 +1,76 @@
1
+ # NAPsack
2
+
3
+ **NAPsack** records and structures your computer use by generating natural language captions from screenshots and input events (click, keypress, scroll, cursor move).
4
+
5
+ <img alt="napsack_overview" src="https://github.com/user-attachments/assets/dd9ca2c5-288c-4977-8dc9-10ca343e56db" />
6
+
7
+ ---
8
+ # Quickstart
9
+
10
+ > Requires Python 3.11+ and `ffmpeg` for video generation. Use `uv` to run the commands below.
11
+
12
+ ## API Keys
13
+
14
+ NAPsack uses a VLM to generate captions. Create a `.env` file in the project root (or export variables in your shell):
15
+
16
+ ```shell
17
+ cp .env.example .env
18
+ ```
19
+
20
+ Then fill in the key for your chosen client:
21
+
22
+ | Client | Variable | Where to get it |
23
+ |--------|----------|-----------------|
24
+ | `gemini` (default) | `GEMINI_API_KEY` | [Google AI Studio](https://aistudio.google.com/apikey) |
25
+ | `vllm` | _(none — pass `--vllm-url`)_ | Self-hosted vLLM server |
26
+ | `bigquery` | _(uses Application Default Credentials)_ | `gcloud auth application-default login` |
27
+
28
+ For Gemini, your `.env` should contain:
29
+
30
+ ```
31
+ GEMINI_API_KEY=your_key_here
32
+ ```
33
+
34
+ **Record** a session (press CTRL+C to stop)
35
+ ```shell
36
+ uv run -m record --monitor
37
+ ```
38
+ **Label** the recorded session
39
+ ```shell
40
+ uv run -m label --session logs/session_name --client gemini
41
+ ```
42
+
43
+ > NAPsack supports `gemini` and `vllm` for data labeling and integrates with BigQuery
44
+
45
+ # Output
46
+
47
+ ```shell
48
+ logs/session_name
49
+ ├── screenshots # Recorded screenshots
50
+ ├── aggregations.jsonl # Recorded event bursts
51
+ ├── captions.jsonl # All VLM-generated captions
52
+ ├── annotated.mp4 # Final video showing generated captions and input events
53
+ └── data.jsonl # Final data containing raw input events and LLM generated captions
54
+ ```
55
+
56
+ # Method
57
+
58
+ ## Record
59
+
60
+ NAPsack groups temporally adjacent input events of the same type into **event bursts**. An event is assigned to the current burst if the time since the preceding event of that type does not exceed the corresponding **gap** threshold and the elapsed time since the burst start remains within the **max** duration.
61
+ * If the **gap** threshold is exceeded, a new burst is started.
62
+ * If the **max** duration is exceeded, the first half of the current burst is finalized and saved, while the second half becomes the active burst.
63
+ A burst is force-restarted when the active monitor changes.
64
+
65
+ ## Label
66
+
67
+ The `label` module:
68
+
69
+ * Loads sessions or raw screenshots and chunks.
70
+ * Uses prompts (in `label/prompts`) to instruct the VLM to generate captions that describe the user's actions and context.
71
+ * Produces `captions.jsonl` and `data.jsonl` (captions aligned to screenshots and events).
72
+ * Optionally renders an annotated video (`annotated.mp4`) showing captions and event visualizations overlayed on frames.
73
+
74
+ The label step performs a second layer of aggregation: it uses the bursts detected at recording time and further refines and annotates them with VLM outputs to create final human-readable summaries.
75
+
76
+
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "napsack"
7
+ version = "0.1.0"
8
+ readme = "README.md"
9
+ description = "NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened."
10
+ requires-python = ">=3.11,<3.14"
11
+ dependencies = [
12
+ "mss (>=10.0.0,<11.0.0)",
13
+ "numpy==2.2",
14
+ "opencv-python (>=4.11.0.86,<5.0.0.0)",
15
+ "pandas (>=2.3.0,<3.0.0)",
16
+ "matplotlib (>=3.10.3,<4.0.0)",
17
+ "scikit-learn (>=1.7.0,<2.0.0)",
18
+ "ruptures (>=1.1.9,<2.0.0)",
19
+ "ipdb>=0.13.13,<0.14.0",
20
+ "plotly (>=6.2.0,<7.0.0)",
21
+ "nbformat (>=5.10.4,<6.0.0)",
22
+ "python-dotenv>=1.0.0",
23
+ "google-generativeai>=0.8.5",
24
+ "datasets>=3.0.0,<4.0.0",
25
+ "pillow>=11.3.0",
26
+ "imageio>=2.37.0",
27
+ "screeninfo>=0.8.1",
28
+ "pynput>=1.8.1",
29
+ "scikit-image>=0.25.2",
30
+ "google-genai>=1.45.0",
31
+ "openai>=2.8.1",
32
+ "google-cloud-storage>=3.6.0",
33
+ "google-cloud-bigquery>=3.38.0",
34
+ ]
35
+
36
+ [tool.uv]
37
+ package = true
38
+
39
+ [tool.setuptools]
40
+ package-dir = {"" = "src"}
41
+ include-package-data = true
42
+
43
+ [tool.setuptools.package-data]
44
+ label = ["prompts/*.txt"]
45
+
46
+ [project.scripts]
47
+ napsack-record = "record.__main__:main"
48
+ napsack-label = "label.__main__:main"
49
+
50
+ [tool.setuptools.packages.find]
51
+ where = ["src"]
52
+ include = ["label*", "record*", "napsack*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,210 @@
1
+ from pathlib import Path
2
+ import argparse
3
+ from dotenv import load_dotenv
4
+
5
+ from label.discovery import discover_sessions, discover_screenshots_sessions, create_single_config
6
+ from label.clients import create_client
7
+ from label.processor import Processor
8
+ from label.visualizer import Visualizer
9
+
10
+ load_dotenv()
11
+
12
+
13
+ def parse_args():
14
+ p = argparse.ArgumentParser(description="Process session recordings with VLM")
15
+
16
+ session_group = p.add_mutually_exclusive_group(required=True)
17
+ session_group.add_argument("--session", type=Path)
18
+ session_group.add_argument("--sessions-root", type=Path)
19
+
20
+ p.add_argument("--chunk-duration", type=int, default=60, help="Chunk duration in seconds")
21
+ p.add_argument("--fps", type=int, default=1, help="Frames per second for video processing")
22
+
23
+ p.add_argument("--screenshots-only", action="store_true", help="Process screenshots folder only without aggregations or annotations")
24
+ p.add_argument("--image-extensions", nargs="+", default=[".jpg", ".jpeg", ".png"], help="Image file extensions to consider")
25
+ p.add_argument("--max-time-gap", type=float, default=300.0, help="Maximum time gap (seconds) between images before forcing a video split (default: 120 = 2 minutes)")
26
+ p.add_argument("--prompt-file", default=None, help="Path to prompt file (default: prompts/default.txt or prompts/screenshots_only.txt if screenshots only)")
27
+ p.add_argument("--hash-cache", type=str, default=None, help="Path to hash_cache.json for deduplicating consecutive similar images")
28
+ p.add_argument("--dedupe-threshold", type=int, default=1, help="Hamming distance threshold for deduplication (drop if <= threshold, default: 1)")
29
+ p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
30
+ p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
31
+ p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
32
+ p.add_argument("--encode-only", action="store_true", help="Only encode videos (create chunks), skip labeling. Useful for pre-processing before running the full pipeline.")
33
+
34
+ p.add_argument("--client", choices=["gemini", "vllm", "bigquery"], default="gemini")
35
+ p.add_argument("--model", default="")
36
+ p.add_argument("--encode-workers", type=int, default=8, help="Number of parallel workers for video encoding")
37
+ p.add_argument("--label-workers", type=int, default=4, help="Number of parallel workers for VLM labeling")
38
+
39
+ vllm_group = p.add_argument_group("vLLM Options")
40
+ vllm_group.add_argument("--vllm-url")
41
+
42
+ bq_group = p.add_argument_group("BigQuery Options")
43
+ bq_group.add_argument("--bq-project", help="GCP project ID for AI Platform endpoint")
44
+ bq_group.add_argument("--bq-bucket-name", help="GCS bucket name for uploading videos")
45
+ bq_group.add_argument("--bq-gcs-prefix", default="video_chunks", help="Prefix/folder path in GCS bucket")
46
+ bq_group.add_argument("--bq-object-table-location", default="us", help="Object table location (e.g., 'us' or 'us.screenomics-gemini')")
47
+
48
+ args = p.parse_args()
49
+
50
+ if not args.model:
51
+ if args.client == 'gemini':
52
+ args.model = 'gemini-3-flash-preview'
53
+ elif args.client == 'vllm':
54
+ args.model = 'Qwen/Qwen3-VL-8B-Thinking-FP8'
55
+ elif args.client == 'bigquery':
56
+ args.model = 'dataset.model' # Placeholder - user must provide full model reference
57
+ if not args.prompt_file:
58
+ args.prompt_file = "prompts/screenshots_only.txt" if args.screenshots_only else "prompts/default.txt"
59
+
60
+ return args
61
+
62
+
63
+ def setup_configs(args):
64
+ if args.session:
65
+ configs = [create_single_config(
66
+ args.session,
67
+ args.chunk_duration,
68
+ args.screenshots_only,
69
+ tuple(args.image_extensions),
70
+ )]
71
+ else:
72
+ if args.screenshots_only:
73
+ configs = discover_screenshots_sessions(
74
+ args.sessions_root,
75
+ args.chunk_duration,
76
+ tuple(args.image_extensions),
77
+ )
78
+ else:
79
+ configs = discover_sessions(
80
+ args.sessions_root,
81
+ args.chunk_duration,
82
+ args.skip_existing,
83
+ )
84
+
85
+ if not configs:
86
+ print(f"No sessions found in {args.sessions_root}")
87
+ return []
88
+
89
+ return configs
90
+
91
+
92
+ def process_with_gemini(args, configs):
93
+ client = create_client(
94
+ 'gemini',
95
+ model_name=args.model,
96
+ )
97
+
98
+ processor = Processor(
99
+ client=client,
100
+ encode_workers=args.encode_workers,
101
+ label_workers=args.label_workers,
102
+ screenshots_only=args.screenshots_only,
103
+ prompt_file=args.prompt_file,
104
+ max_time_gap=args.max_time_gap,
105
+ hash_cache_path=args.hash_cache,
106
+ dedupe_threshold=args.dedupe_threshold,
107
+ )
108
+
109
+ return processor.process_sessions(
110
+ configs,
111
+ fps=args.fps,
112
+ annotate=args.annotate and not args.screenshots_only,
113
+ encode_only=args.encode_only,
114
+ )
115
+
116
+
117
+ def process_with_vllm(args, configs):
118
+ client = create_client(
119
+ 'vllm',
120
+ base_url=args.vllm_url if args.vllm_url.endswith('/v1') else f"{args.vllm_url}/v1",
121
+ model_name=args.model
122
+ )
123
+
124
+ processor = Processor(
125
+ client=client,
126
+ encode_workers=args.encode_workers,
127
+ label_workers=args.label_workers,
128
+ screenshots_only=args.screenshots_only,
129
+ prompt_file=args.prompt_file,
130
+ max_time_gap=args.max_time_gap,
131
+ hash_cache_path=args.hash_cache,
132
+ dedupe_threshold=args.dedupe_threshold,
133
+ )
134
+
135
+ return processor.process_sessions(
136
+ configs,
137
+ fps=args.fps,
138
+ annotate=args.annotate and not args.screenshots_only,
139
+ encode_only=args.encode_only,
140
+ )
141
+
142
+
143
+ def process_with_bigquery(args, configs):
144
+ client = create_client(
145
+ 'bigquery',
146
+ model_name=args.model,
147
+ bucket_name=args.bq_bucket_name,
148
+ gcs_prefix=args.bq_gcs_prefix,
149
+ object_table_location=args.bq_object_table_location,
150
+ project_id=args.bq_project,
151
+ )
152
+
153
+ processor = Processor(
154
+ client=client,
155
+ encode_workers=args.encode_workers,
156
+ label_workers=args.label_workers,
157
+ screenshots_only=args.screenshots_only,
158
+ prompt_file=args.prompt_file,
159
+ max_time_gap=args.max_time_gap,
160
+ hash_cache_path=args.hash_cache,
161
+ dedupe_threshold=args.dedupe_threshold,
162
+ )
163
+
164
+ return processor.process_sessions(
165
+ configs,
166
+ fps=args.fps,
167
+ annotate=args.annotate and not args.screenshots_only,
168
+ encode_only=args.encode_only,
169
+ )
170
+
171
+
172
+ def main():
173
+ args = parse_args()
174
+
175
+ configs = setup_configs(args)
176
+ if not configs:
177
+ return
178
+
179
+ print(f"Processing {len(configs)} sessions")
180
+
181
+ if args.client == 'gemini':
182
+ results = process_with_gemini(args, configs)
183
+ elif args.client == 'vllm':
184
+ results = process_with_vllm(args, configs)
185
+ elif args.client == 'bigquery':
186
+ results = process_with_bigquery(args, configs)
187
+ else:
188
+ raise ValueError(f"Unknown client: {args.client}")
189
+
190
+ print(f"✓ Processed {len(results)} sessions")
191
+
192
+ if args.visualize:
193
+ print("\nCreating visualizations...")
194
+ visualizer = Visualizer(args.annotate)
195
+
196
+ for config in configs:
197
+ if not config.matched_captions_jsonl.exists():
198
+ print(f"Skipping Visualizing {config.session_id}: no data.jsonl")
199
+ continue
200
+
201
+ try:
202
+ output = config.session_folder / "annotated.mp4"
203
+ visualizer.visualize(config.session_folder, output, args.fps)
204
+ print(f"✓ {config.session_id}: {output}")
205
+ except Exception as e:
206
+ print(f"✗ {config.session_id}: {e}")
207
+
208
+
209
+ if __name__ == '__main__':
210
+ main()
@@ -0,0 +1,150 @@
1
+ from pathlib import Path
2
+ from typing import List, Dict, Any, Optional
3
+ import json
4
+
5
+
6
+ def match_captions_with_events(
7
+ captions_path: Path,
8
+ aggregations_path: Path,
9
+ output_path: Path,
10
+ fps: int = 1
11
+ ) -> List[Dict[str, Any]]:
12
+ """
13
+ Match captions with aggregated events based on timestamps.
14
+
15
+ Args:
16
+ captions_path: Path to captions.jsonl
17
+ aggregations_path: Path to aggregations.jsonl
18
+ output_path: Path to save matched_captions.jsonl
19
+ fps: Frames per second used in video creation
20
+
21
+ Returns:
22
+ List of matched caption-event objects
23
+ """
24
+ # Load captions
25
+ captions = []
26
+ with open(captions_path, 'r', encoding='utf-8') as f:
27
+ for line in f:
28
+ if line.strip():
29
+ captions.append(json.loads(line))
30
+
31
+ # Load aggregations
32
+ aggregations = []
33
+ with open(aggregations_path, 'r', encoding='utf-8') as f:
34
+ for line in f:
35
+ if line.strip():
36
+ agg = json.loads(line)
37
+ aggregations.append(agg)
38
+
39
+ # Sort aggregations by timestamp
40
+ aggregations.sort(key=lambda x: x.get('timestamp', 0))
41
+
42
+ if not aggregations:
43
+ print("[Matcher] Warning: No aggregations found")
44
+ return []
45
+
46
+ # Get first aggregation timestamp (video start time)
47
+ first_timestamp = aggregations[0].get('timestamp', 0)
48
+
49
+ print(f"[Matcher] Video start time: {first_timestamp}")
50
+ print(f"[Matcher] Total aggregations: {len(aggregations)}")
51
+ print(f"[Matcher] FPS: {fps}")
52
+
53
+ # Match captions with events
54
+ matched_data = []
55
+
56
+ for caption in captions:
57
+ # Convert MM:SS to seconds
58
+ start_seconds = caption['start_seconds']
59
+ end_seconds = caption['end_seconds']
60
+
61
+ # Convert video time to aggregation indices
62
+ # Each aggregation represents 1 frame, so index = seconds * fps
63
+ start_index = int(start_seconds * fps)
64
+ end_index = int(end_seconds * fps)
65
+
66
+ # Clamp to valid range
67
+ start_index = max(0, min(start_index, len(aggregations) - 1))
68
+ end_index = max(start_index, min(end_index, len(aggregations) - 1))
69
+
70
+ print(f"[Matcher] Caption '{caption['caption'][:50]}...' -> indices [{start_index}, {end_index}]")
71
+
72
+ # Get aggregations in this range
73
+ matched_aggs = aggregations[start_index:end_index + 1]
74
+
75
+ if not matched_aggs:
76
+ # No events matched, but still save the caption
77
+ matched_entry = {
78
+ "start_time": first_timestamp + start_seconds,
79
+ "end_time": first_timestamp + end_seconds,
80
+ "start_index": start_index,
81
+ "end_index": end_index,
82
+ "img": None,
83
+ "caption": caption['caption'],
84
+ "raw_events": [],
85
+ "num_aggregations": 0,
86
+ "start_formatted": caption['start'],
87
+ "end_formatted": caption['end'],
88
+ }
89
+ else:
90
+ # Get first and last aggregation for time and image
91
+ first_agg = matched_aggs[0]
92
+ last_agg = matched_aggs[-1]
93
+
94
+ # Concatenate all events from matched aggregations
95
+ all_events = []
96
+ for agg in matched_aggs:
97
+ events = agg.get('events', [])
98
+ all_events.extend(events)
99
+
100
+ matched_entry = {
101
+ "start_time": first_agg.get('timestamp'),
102
+ "end_time": last_agg.get('timestamp'),
103
+ "start_index": start_index,
104
+ "end_index": end_index,
105
+ "img": first_agg.get('screenshot_path'),
106
+ "caption": caption['caption'],
107
+ "raw_events": all_events,
108
+ "num_aggregations": len(matched_aggs),
109
+ "start_formatted": caption['start'],
110
+ "end_formatted": caption['end'],
111
+ }
112
+
113
+ matched_data.append(matched_entry)
114
+
115
+ # Save matched data
116
+ with open(output_path, 'w', encoding='utf-8') as f:
117
+ for entry in matched_data:
118
+ f.write(json.dumps(entry, ensure_ascii=False) + '\n')
119
+
120
+ print(f"[Matcher] Saved {len(matched_data)} matched entries to {output_path}")
121
+
122
+ return matched_data
123
+
124
+
125
+ def create_matched_captions_for_session(session_dir: Path, fps: int = 1) -> Optional[Path]:
126
+ """
127
+ Create matched_captions.jsonl for a session directory.
128
+
129
+ Args:
130
+ session_dir: Path to session directory
131
+ fps: Frames per second used in video creation
132
+
133
+ Returns:
134
+ Path to created matched_captions.jsonl or None if failed
135
+ """
136
+ captions_path = session_dir / "captions.jsonl"
137
+ aggregations_path = session_dir / "aggregations.jsonl"
138
+ output_path = session_dir / "matched_captions.jsonl"
139
+
140
+ if not captions_path.exists():
141
+ print(f"[Matcher] Warning: {captions_path} not found")
142
+ return None
143
+
144
+ if not aggregations_path.exists():
145
+ print(f"[Matcher] Warning: {aggregations_path} not found")
146
+ return None
147
+
148
+ match_captions_with_events(captions_path, aggregations_path, output_path, fps)
149
+
150
+ return output_path
@@ -0,0 +1,28 @@
1
+ from label.clients.client import VLMClient, CAPTION_SCHEMA
2
+ from label.clients.gemini import GeminiClient, GeminiResponse
3
+ from label.clients.vllm import VLLMClient, VLLMResponse
4
+ from label.clients.bigquery import BigQueryClient, BigQueryResponse
5
+
6
+
7
+ def create_client(client_type: str, **kwargs) -> VLMClient:
8
+ if client_type == 'gemini':
9
+ return GeminiClient(**kwargs)
10
+ elif client_type == 'vllm':
11
+ return VLLMClient(**kwargs)
12
+ elif client_type == 'bigquery':
13
+ return BigQueryClient(**kwargs)
14
+ else:
15
+ raise ValueError(f"Unknown client type: {client_type}")
16
+
17
+
18
+ __all__ = [
19
+ "VLMClient",
20
+ "GeminiClient",
21
+ "GeminiResponse",
22
+ "VLLMClient",
23
+ "VLLMResponse",
24
+ "BigQueryClient",
25
+ "BigQueryResponse",
26
+ "CAPTION_SCHEMA",
27
+ "create_client",
28
+ ]