pixel-patrol-base 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixel_patrol_base-0.1.0/PKG-INFO +18 -0
- pixel_patrol_base-0.1.0/pyproject.toml +41 -0
- pixel_patrol_base-0.1.0/setup.cfg +4 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/api.py +79 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/cli.py +130 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/config.py +7 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/contracts.py +34 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/feature_schema.py +85 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/file_system.py +159 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/processing.py +169 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/project.py +240 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/project_settings.py +12 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/record.py +116 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/specs.py +46 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/core/validation.py +122 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/io/project_io.py +407 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugin_registry.py +79 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/processors/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/processors/basic_stats_processor.py +38 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/processors/histogram_processor.py +109 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/processors/thumbnail_processor.py +94 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/base_dynamic_table_widget.py +144 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/dataset_stats/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/dataset_stats/dataset_histograms.py +270 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/dataset_stats/dataset_stats.py +50 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/dataset_stats/dynamic_dataset_metrics.py +22 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/file_stats/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/file_stats/file_stats.py +163 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/metadata/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/metadata/data_type.py +98 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/metadata/dim_order.py +97 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/metadata/dim_size.py +148 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/summary/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/summary/dataframe.py +53 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/summary/file_summary.py +98 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/summary/sunburst.py +155 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/visualization/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/visualization/embedding_projector.py +294 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/plugins/widgets/visualization/image_mosaik.py +196 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/report/__init__.py +0 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/report/dashboard_app.py +219 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/report/utils.py +323 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/report/widget.py +15 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/report/widget_categories.py +14 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/utils/array_utils.py +131 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/utils/df_utils.py +104 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/utils/path_utils.py +93 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base/utils/utils.py +19 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base.egg-info/PKG-INFO +18 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base.egg-info/SOURCES.txt +63 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base.egg-info/dependency_links.txt +1 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base.egg-info/entry_points.txt +8 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base.egg-info/requires.txt +12 -0
- pixel_patrol_base-0.1.0/src/pixel_patrol_base.egg-info/top_level.txt +1 -0
- pixel_patrol_base-0.1.0/tests/test_add_and_delete_paths.py +361 -0
- pixel_patrol_base-0.1.0/tests/test_cli_paths.py +131 -0
- pixel_patrol_base-0.1.0/tests/test_create_project.py +72 -0
- pixel_patrol_base-0.1.0/tests/test_file_system.py +284 -0
- pixel_patrol_base-0.1.0/tests/test_paths_utils.py +53 -0
- pixel_patrol_base-0.1.0/tests/test_project_io.py +805 -0
- pixel_patrol_base-0.1.0/tests/test_record.py +72 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pixel-patrol-base
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Image prevalidation tool
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: click>=8.2.1
|
|
8
|
+
Requires-Dist: dash-ag-grid>=32.3.0
|
|
9
|
+
Requires-Dist: dash-bootstrap-components>=2.0.3
|
|
10
|
+
Requires-Dist: matplotlib>=3.10.3
|
|
11
|
+
Requires-Dist: polars>=1.31.0
|
|
12
|
+
Requires-Dist: pyarrow>=21.0.0
|
|
13
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
14
|
+
Requires-Dist: statsmodels>=0.14.4
|
|
15
|
+
Requires-Dist: tensorboard>=2.18.0
|
|
16
|
+
Requires-Dist: tensorboardx>=2.6.4
|
|
17
|
+
Requires-Dist: dask>=2025.5.1
|
|
18
|
+
Requires-Dist: tqdm>=4.67.1
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pixel-patrol-base"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Image prevalidation tool"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"click>=8.2.1",
|
|
9
|
+
"dash-ag-grid>=32.3.0",
|
|
10
|
+
"dash-bootstrap-components>=2.0.3",
|
|
11
|
+
"matplotlib>=3.10.3",
|
|
12
|
+
"polars>=1.31.0",
|
|
13
|
+
"pyarrow>=21.0.0",
|
|
14
|
+
"pyyaml>=6.0.2",
|
|
15
|
+
"statsmodels>=0.14.4",
|
|
16
|
+
"tensorboard>=2.18.0", # TODO: Are we sure we want it in base?
|
|
17
|
+
"tensorboardx>=2.6.4", # TODO: Are we sure we want it in base?
|
|
18
|
+
"dask>=2025.5.1",
|
|
19
|
+
"tqdm>=4.67.1",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.uv]
|
|
23
|
+
package = true
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
packages = ["src/pixel_patrol_base"]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
pixel-patrol = "pixel_patrol_base.cli:cli"
|
|
30
|
+
|
|
31
|
+
[project.entry-points."pixel_patrol.processor_plugins"]
|
|
32
|
+
pixel_patrol_base_processing_builtins = "pixel_patrol_base.plugin_registry:register_processor_plugins"
|
|
33
|
+
|
|
34
|
+
[project.entry-points."pixel_patrol.widget_plugins"]
|
|
35
|
+
pixel_patrol_base_widget_builtins = "pixel_patrol_base.plugin_registry:register_widget_plugins"
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=8.3.5",
|
|
40
|
+
"pytest-mock>=3.14.1",
|
|
41
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Union, Iterable, List, Optional
|
|
3
|
+
import polars as pl
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from pixel_patrol_base.core.project import Project
|
|
7
|
+
from pixel_patrol_base.core.project_settings import Settings
|
|
8
|
+
from pixel_patrol_base.io.project_io import export_project as _io_export_project
|
|
9
|
+
from pixel_patrol_base.io.project_io import import_project as _io_import_project
|
|
10
|
+
from pixel_patrol_base.report.dashboard_app import create_app
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
def create_project(name: str, base_dir: Union[str, Path], loader: Optional[str] = None) -> Project:
    """
    Create a new project.

    Args:
        name: Name of the project.
        base_dir: Base directory of the project, as a string or ``Path``.
        loader: Optional loader name forwarded to ``Project``; ``None`` when
            no loader is requested.

    Returns:
        The newly constructed ``Project``.
    """
    logger.info(f"API Call: Creating new project '{name}' with base directory '{base_dir}'.")
    return Project(name, base_dir, loader)
|
|
18
|
+
|
|
19
|
+
def add_paths(project: Project, paths: Union[str, Path, Iterable[Union[str, Path]]]) -> Project:
    """
    Add one or several paths to a project.

    Args:
        project: The Project instance to update.
        paths: A single path or an iterable of paths, each a ``str`` or ``Path``.

    Returns:
        The value returned by ``project.add_paths(paths)``.
    """
    logger.info(f"API Call: Adding paths to project '{project.name}'.")
    return project.add_paths(paths)
|
|
22
|
+
|
|
23
|
+
def delete_path(project: Project, path: str) -> Project:
    """
    Remove a single path from a project.

    Args:
        project: The Project instance to update.
        path: The path to remove.

    Returns:
        The value returned by ``project.delete_path(path)``.
    """
    logger.info(f"API Call: deleting paths from project '{project.name}'.")
    return project.delete_path(path)
|
|
26
|
+
|
|
27
|
+
def set_settings(project: Project, settings: Settings) -> Project:
    """
    Sets the project-specific settings by replacing the entire Settings object.
    Detailed validation for individual settings is performed within the Project class itself.

    Args:
        project: The Project instance to update.
        settings: An instance of the Settings dataclass containing the desired settings.

    Returns:
        The value returned by ``project.set_settings(settings)``.
    """
    logger.info(f"API Call: Attempting to set project settings for '{project.name}'.")
    return project.set_settings(settings)
|
|
37
|
+
|
|
38
|
+
def process_files(project: Project) -> Project:
    """
    Process the project's files and build its records DataFrame.

    Args:
        project: The Project instance to process.

    Returns:
        The value returned by ``project.process_records()``.
    """
    logger.info(f"API Call: Processing files and building DataFrame for project '{project.name}'.")
    return project.process_records()
|
|
41
|
+
|
|
42
|
+
def show_report(project: Project, host: str = "127.0.0.1", port: Optional[int] = None, debug: bool = False) -> None:
    """
    Build the dashboard app for ``project`` and run it.

    The app is always started with ``use_reloader=False``. The debug reloader
    spawns a second process and will re-import/run the script, which causes
    the example to execute twice (scan/process files two times). A developer
    who needs debug mode can pass `debug=True`; the reloader stays disabled
    either way, so the calling script is not re-executed.

    Args:
        project: The Project whose report is shown.
        host: Host passed to ``app.run``.
        port: Port passed to ``app.run``; ``None`` is forwarded unchanged.
        debug: Debug flag passed to ``app.run``.
    """
    logger.info(f"API Call: Showing report for project '{project.name}'.")
    app = create_app(project)
    app.run(debug=debug, host=host, port=port, use_reloader=False)
|
|
54
|
+
|
|
55
|
+
def export_project(project: Project, dest: Path) -> None:  # TODO: think about when project can be saved
    """
    Export a project to ``dest``.

    Delegates to ``pixel_patrol_base.io.project_io.export_project`` and logs
    before and after the export.

    Args:
        project: The Project instance to export.
        dest: Destination path of the export.
    """
    logger.info(f"API Call: Exporting project '{project.name}' to '{dest}'.")
    _io_export_project(project, dest)
    logger.info(f"API Call: Project '{project.name}' exported successfully.")
|
|
59
|
+
|
|
60
|
+
def import_project(src: Path) -> Project:
    """
    Import a project from ``src``.

    Delegates to ``pixel_patrol_base.io.project_io.import_project`` and logs
    before and after the import.

    Args:
        src: Path of the exported project to load.

    Returns:
        The imported ``Project``.
    """
    logger.info(f"API Call: Importing project from '{src}'.")
    project = _io_import_project(src)
    logger.info(f"API Call: Project '{project.name}' imported successfully from '{src}'.")
    return project
|
|
65
|
+
|
|
66
|
+
def get_name(project: Project) -> str:
    """Return the project's name as reported by ``project.get_name()``."""
    return project.get_name()
|
|
68
|
+
|
|
69
|
+
def get_base_dir(project: Project) -> Optional[Path]:
    """Return the project's base directory as reported by ``project.get_base_dir()`` (may be ``None``)."""
    return project.get_base_dir()
|
|
71
|
+
|
|
72
|
+
def get_paths(project: Project) -> List[Path]:
    """Return the project's paths as reported by ``project.get_paths()``."""
    return project.get_paths()
|
|
74
|
+
|
|
75
|
+
def get_settings(project: Project) -> Settings:
    """Return the project's ``Settings`` as reported by ``project.get_settings()``."""
    return project.get_settings()
|
|
77
|
+
|
|
78
|
+
def get_records_df(project: Project) -> Optional[pl.DataFrame]:
    """Return the project's records DataFrame as reported by ``project.get_records_df()`` (may be ``None``)."""
    return project.get_records_df()
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import webbrowser
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from threading import Timer
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from pixel_patrol_base.api import (
|
|
9
|
+
create_project,
|
|
10
|
+
add_paths,
|
|
11
|
+
set_settings,
|
|
12
|
+
process_files,
|
|
13
|
+
export_project,
|
|
14
|
+
import_project,
|
|
15
|
+
show_report,
|
|
16
|
+
)
|
|
17
|
+
from pixel_patrol_base.core.project_settings import Settings
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@click.group()
def cli():
    """
    A command-line tool for processing image reports with Pixel Patrol.

    This tool facilitates a two-step process:
    1. Exporting a processed project to a ZIP file.
    2. Displaying a report from an exported ZIP file.
    """
    # The group callback has nothing to do itself; the `export` and `report`
    # subcommands registered on it carry all behavior. The docstring above is
    # the help text click shows for the group.
|
|
30
|
+
|
|
31
|
+
@cli.command()
@click.argument('base_directory', type=click.Path(exists=True, file_okay=False, dir_okay=True, readable=True, path_type=Path))
@click.option('--output-zip', '-o', type=click.Path(exists=False, dir_okay=False, writable=True, path_type=Path),
              help='Required: Name of the output ZIP file for the exported project (e.g., my_project.zip).',
              required=True)
@click.option('--name', type=str, required=False,
              help='Optional: Name of the project. If not provided, derived from BASE_DIRECTORY.')
@click.option('--paths', '-p', multiple=True, type=str,
              help='Optional: Paths (subdirectories) to treat as **experimental conditions**, relative to BASE_DIRECTORY. '
                   'Can be specified multiple times. If omitted, all immediate subdirectories '
                   'of BASE_DIRECTORY will be included, or if BASE_DIRECTORY has no subdirectories, '
                   'it is treated as a single condition.')
@click.option('--loader', '-l', type=str, show_default=True,
              help='Recommended: Pixel Patrol file loader (e.g., bioio, zarr). If omitted, only basic file info is collected.')
@click.option('--cmap', type=str, default="rainbow", show_default=True,
              help='Colormap for report visualization (e.g., viridis, plasma, rainbow).')
@click.option('--n-example-files', type=int, default=9, show_default=True,
              help='Number of example files to display in the report.')
@click.option('--file-extension', '-e', multiple=True,
              help='Optional: File extensions to include (e.g., png, jpg). Can be specified multiple times. '
                   'If not specified, all supported extensions will be used.')
@click.option('--flavor', type=str, default="", show_default=True,
              help='Name of pixel patrol configuration, will be displayed next to the tool name.')
def export(base_directory: Path, output_zip: Path, name: str | None, paths: tuple[str, ...],
           loader: str | None, cmap: str, n_example_files: int, file_extension: tuple[str, ...], flavor: str):
    """
    Exports a Pixel Patrol project to a ZIP file.
    Processes images from the BASE_DIRECTORY and specified --paths.
    """
    # NOTE: the docstring above is the user-facing help text, so parameter
    # notes live in comments:
    #   base_directory  existing directory (validated by click), resolved below
    #   output_zip      destination file of the export
    #   name            project name; falls back to the base directory's name
    #   paths           zero or more paths handed to add_paths as given
    #   loader          loader name, or None when --loader is omitted
    #   file_extension  zero or more extensions; empty means "all"

    # Always operate on an absolute base directory so downstream path resolution is stable.
    base_directory = base_directory.resolve()

    # Derive project_name if not provided
    if name is None:
        name = base_directory.name  # Use the name of the base directory
        click.echo(f"Project name not provided, deriving from base directory: '{name}'")

    click.echo(f"Creating project: '{name}' from base directory '{base_directory}'")
    my_project = create_project(name, str(base_directory), loader=loader)

    if paths:
        click.echo(f"Adding explicitly specified paths: {', '.join(paths)}. Resolution will be relative to '{base_directory}'")
        add_paths(my_project, paths)
    else:
        # If no paths, we want to add the base directory itself.
        click.echo(f"No --paths specified. Processing all images in '{base_directory}'.")
        add_paths(my_project, base_directory)

    if file_extension:
        # The file scanner matches on lower-cased suffixes without the leading
        # dot, so normalize user input the same way; otherwise "-e .PNG" or
        # "-e TIF" would silently match no files.
        selected_extensions = {ext.strip().lstrip(".").lower() for ext in file_extension}
    else:
        selected_extensions = "all"

    initial_settings = Settings(
        cmap=cmap,
        n_example_files=n_example_files,
        selected_file_extensions=selected_extensions,
        pixel_patrol_flavor=flavor,
    )
    click.echo(f"Setting project settings: {initial_settings}")
    set_settings(my_project, initial_settings)

    click.echo("Processing images...")
    process_files(my_project)

    click.echo(f"Exporting project to: '{output_zip}'")
    # click already delivers output_zip as a Path (path_type=Path).
    export_project(my_project, output_zip)
    click.echo("Export complete.")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@cli.command()
@click.argument('input_zip', type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, path_type=Path))
@click.option('--port', type=int, default=8050, show_default=True,
              help='Port number for the Dash report server.')
def report(input_zip: Path, port: int):
    """
    Displays the report of an exported Pixel Patrol project from a ZIP file.
    """
    click.echo(f"Importing project from: '{input_zip}'")
    # click already delivers input_zip as a Path (path_type=Path).
    my_project = import_project(input_zip)
    click.echo("Project imported.")

    report_url = f"http://127.0.0.1:{port}"
    click.echo(f"Dash report will run on {report_url}/")
    click.echo("Attempting to open report in your default browser...")

    def _open_browser():
        # Guard against opening a second tab if Werkzeug ever runs this code
        # in its reloader child process (which sets WERKZEUG_RUN_MAIN).
        if not os.environ.get("WERKZEUG_RUN_MAIN"):
            webbrowser.open_new_tab(report_url)

    # show_report blocks while the server runs, so the browser must be opened
    # from a timer thread. The 1 second delay gives the server a moment to
    # start and lets Dash's own startup messages appear first.
    browser_timer = Timer(1, _open_browser)
    browser_timer.start()

    click.echo("Showing report...")
    try:
        show_report(my_project, port=port)
    finally:
        # If the server stops (or fails to start) before the timer fires, do
        # not open a tab pointing at a dead URL. Cancelling a timer that has
        # already fired is a no-op.
        browser_timer.cancel()
|
|
128
|
+
|
|
129
|
+
# Run the click group when this module is executed directly as a script.
if __name__ == "__main__":
    cli()
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import Protocol, Iterable, Set, Any, Dict, List
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from pixel_patrol_base.core.record import Record
|
|
7
|
+
from pixel_patrol_base.core.specs import ProcessResult, RecordSpec, ProcessorOutput
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PixelPatrolLoader(Protocol):
    """Structural interface (``typing.Protocol``) describing a Pixel Patrol loader."""
    NAME: str  # loader name
    SUPPORTED_EXTENSIONS: Set[str]  # presumably the file extensions the loader can open - confirm format (dot / case) against implementations
    OUTPUT_SCHEMA: Dict[str, Any]  # exact output column name -> type
    OUTPUT_SCHEMA_PATTERNS: List[tuple[str, Any]]  # (regex, type) pairs for dynamically named output columns
    FOLDER_EXTENSIONS: Set[str]  # extensions of folder-based datasets; compared with the accepted extensions during file-system scans
    # Return True when the directory at `path` should be treated as one dataset.
    def is_folder_supported(self, path: Path) -> bool: ...
    # Load `source` and return it as a Record.
    def load(self, source: str) -> Record: ...
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PixelPatrolProcessor(Protocol):
    """Structural interface (``typing.Protocol``) describing a Pixel Patrol processor."""
    NAME: str  # processor name
    INPUT: RecordSpec  # presumably the kind of Record this processor accepts - confirm against RecordSpec
    OUTPUT: ProcessorOutput  # "features" or "record"
    # Process one Record and return the result.
    def run(self, art: Record) -> ProcessResult: ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PixelPatrolWidget(Protocol):
    """Structural interface (``typing.Protocol``) describing a report widget."""
    NAME: str  # human readable name
    TAB: str  # WidgetCategories value
    REQUIRES: Set[str]  # columns required to render
    REQUIRES_PATTERNS: Iterable[str] | None  # optional regexes for dynamic cols

    # Return the widget's layout as a list (presumably of Dash components - confirm).
    def layout(self) -> list: ...
    # Hook the widget into `app`, using the global DataFrame `df_global`
    # (presumably this is where callbacks are registered - confirm).
    def register(self, app, df_global: pl.DataFrame) -> None: ...
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Tuple, Type
|
|
5
|
+
|
|
6
|
+
from pixel_patrol_base.core.contracts import PixelPatrolLoader
|
|
7
|
+
|
|
8
|
+
# Static schema: exact column name -> target type.
Schema = Dict[str, Any]
# Dynamic schema: (regex, target type) pairs, matched against column names.
PatternSpec = List[Tuple[str, Any]]
|
|
10
|
+
|
|
11
|
+
def merge_output_schemas(processors: Iterable[object]) -> Tuple[Schema, PatternSpec]:
    """
    Combine the declared output schemas of several processors.

    Args:
        processors: Objects that may expose ``OUTPUT_SCHEMA`` and/or
            ``OUTPUT_SCHEMA_PATTERNS``; missing or falsy attributes are ignored.

    Returns:
        ``(static, patterns)``: one dict holding every ``OUTPUT_SCHEMA`` entry
        (later processors overwrite earlier ones on key clashes) and one list
        holding all ``OUTPUT_SCHEMA_PATTERNS`` entries in processor order.
    """
    merged_static: Schema = {}
    merged_patterns: PatternSpec = []
    for processor in processors:
        declared_schema = getattr(processor, "OUTPUT_SCHEMA", {})
        if declared_schema:
            merged_static.update(declared_schema)
        declared_patterns = getattr(processor, "OUTPUT_SCHEMA_PATTERNS", [])
        if declared_patterns:
            # extend() copies the entries; the processor's own list is untouched
            merged_patterns.extend(declared_patterns)
    return merged_static, merged_patterns
|
|
18
|
+
|
|
19
|
+
def infer_col_type(col: str, static: Schema, patterns: PatternSpec):
    """
    Look up the declared type of column ``col``.

    An exact entry in ``static`` wins; otherwise the type of the first
    ``(regex, type)`` pair in ``patterns`` whose regex matches ``col`` (via
    ``re.match``) is returned. ``None`` means the column is unknown and
    should be left as-is.
    """
    try:
        return static[col]
    except KeyError:
        pass
    return next((typ for pat, typ in patterns if re.match(pat, col)), None)
|
|
26
|
+
|
|
27
|
+
def coerce_row_types(row: Dict[str, Any], static: Schema, patterns: PatternSpec) -> Dict[str, Any]:
    """
    Return a copy of ``row`` with values coerced to their declared types.

    For each column the target type comes from ``infer_col_type``. Columns
    with no declared type and ``None`` values are left untouched. A non-list
    value declared as ``list`` is wrapped in a one-element list; a non-dict
    value declared as ``dict`` is kept unchanged; everything else is passed
    through the target type's constructor. If that conversion raises, the
    original value is kept (best effort).
    """
    coerced = dict(row)
    for column, value in row.items():
        target = infer_col_type(column, static, patterns)
        if target is None or value is None:
            continue
        if target is list and not isinstance(value, list):
            coerced[column] = [value]
            continue
        if target is dict and not isinstance(value, dict):
            # left unchanged on purpose: recommend JSON string instead
            continue
        try:
            coerced[column] = target(value)  # int/float/str
        except Exception:
            # keep original if coercion fails
            continue
    return coerced
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_requirements_as_patterns(component: Type[PixelPatrolLoader]) -> List[str]:
    """
    Express all of a component's declared columns as regex strings.

    Args:
        component: A loader or processor exposing ``OUTPUT_SCHEMA`` and
            ``OUTPUT_SCHEMA_PATTERNS``.

    Returns:
        First one anchored, escaped regex (``^name$``) per ``OUTPUT_SCHEMA``
        key, then the regex of every ``OUTPUT_SCHEMA_PATTERNS`` entry, each in
        declaration order.
    """
    requirements: List[str] = []

    # Static columns: match the exact name only.
    for column_name in component.OUTPUT_SCHEMA.keys():
        requirements.append("^" + re.escape(column_name) + "$")

    # Dynamic columns: the regex is the first element of each entry.
    for entry in component.OUTPUT_SCHEMA_PATTERNS:
        requirements.append(entry[0])

    return requirements
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_dynamic_patterns(component: Type[PixelPatrolLoader]) -> List[str]:
    """Return the regex (first element) of every ``OUTPUT_SCHEMA_PATTERNS`` entry of ``component``."""
    regexes: List[str] = []
    for entry in component.OUTPUT_SCHEMA_PATTERNS:
        regexes.append(entry[0])
    return regexes
|
|
71
|
+
|
|
72
|
+
def patterns_from_processor(P) -> List[str]:
    """
    Collect the regex strings declared in a processor's OUTPUT_SCHEMA_PATTERNS.

    ``P`` may be a processor class or an instance. When the attribute is
    missing or ``None`` on ``P`` itself, ``P.__class__`` is consulted as a
    fallback. Compiled patterns contribute their ``.pattern`` string; plain
    strings are returned unchanged. An empty list is returned when nothing
    is declared.
    """
    declared = getattr(P, "OUTPUT_SCHEMA_PATTERNS", None)
    if declared is None:
        declared = getattr(getattr(P, "__class__", None), "OUTPUT_SCHEMA_PATTERNS", None)

    if not declared:
        return []
    # handle compiled or plain string
    return [getattr(pat, "pattern", pat) for pat, _typ in declared]
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Any, List, Literal, Optional, Set
|
|
3
|
+
import os
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
from pixel_patrol_base.utils.utils import format_bytes_to_human_readable
|
|
9
|
+
from pixel_patrol_base.core.contracts import PixelPatrolLoader
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def make_basic_record(path: Path, base: Path, is_folder: bool = False) -> Dict[str, Any]:
    """
    Create a basic metadata record for a file or folder,
    computing depth relative to `base` and normalizing extensions.

    Args:
        path: The file or folder to describe.
        base: The imported base path; depth is the number of path parts
            `path` has beyond `base`.
        is_folder: When True the record is typed "folder", has size 0 and
            no file extension.

    Returns:
        A dict with the keys path, name, type, parent, depth, size_bytes,
        modification_date, file_extension and imported_path. An empty dict
        is returned (and a warning logged) when `path` cannot be stat'ed.
    """
    try:
        # A single stat call supplies both the size and the modification time.
        # It is done for folders too, so a vanished or unreadable entry takes
        # the same "warn and return {}" path instead of raising later.
        st = path.stat()
    except Exception as e:
        logger.warning(f"Failed stat for {path}: {e}")
        return {}

    depth = len(path.parts) - len(base.parts)

    # TODO: I guess we're missing imported_path_short and modification_month that were created in preprocess_files
    # common_base = find_common_base(unique_folders) - should be added after
    # pl.col("modification_date").dt.month().alias("modification_month"),
    # pl.col("imported_path").str.replace(common_base, "", literal=True).alias("imported_path_short"),
    record: Dict[str, Any] = {
        "path": str(path),
        "name": path.name,
        "type": "folder" if is_folder else "file",
        # the base itself has no parent within the imported tree
        "parent": str(path.parent) if path != base else None,
        "depth": depth,
        "size_bytes": 0 if is_folder else st.st_size,
        "modification_date": datetime.fromtimestamp(st.st_mtime),
        # lower-cased suffix without the leading dot
        "file_extension": None if is_folder else path.suffix.lstrip(".").lower(),
        "imported_path": str(base),
    }
    return record
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def walk_filesystem(
    bases: List[Path],
    accepted_extensions: Set[str] | Literal["all"],
    loader: Optional[PixelPatrolLoader] = None,
) -> pl.DataFrame:
    """
    Walk every base directory and collect one basic record per accepted entry.

    - Only include files and loader-supported folder datasets (no plain directories).
    - accepted_extensions == "all": include all files + any folder datasets supported by the loader.
    - accepted_extensions is a set: include files with suffix in set; folder datasets are
      considered only if the set intersects loader.FOLDER_EXTENSIONS.
    - A folder dataset is recorded like a file (``is_folder=False``) and is not descended into.
    - Entries for which ``make_basic_record`` returns an empty dict (stat failure) are skipped,
      so they do not show up as all-null rows.

    Args:
        bases: Directories to scan recursively.
        accepted_extensions: Lower-case extensions without leading dot, or the literal "all".
        loader: Optional loader used to detect folder datasets.

    Returns:
        A DataFrame built from the collected records; an empty DataFrame when nothing was accepted.
    """
    records: List[dict] = []
    include_all = accepted_extensions == "all"

    # Folder datasets are only looked for when a loader can recognize them and
    # the requested extensions allow for them.
    is_folder_check = (loader is not None) and \
        hasattr(loader, "is_folder_supported") and \
        (include_all or
         not accepted_extensions.isdisjoint(getattr(loader, "FOLDER_EXTENSIONS", set())))
    folder_support_fn = loader.is_folder_supported if is_folder_check else None

    for base in bases:
        for root, dirnames, filenames in os.walk(base, topdown=True):
            dirpath = Path(root)

            if is_folder_check:
                keep: List[str] = []
                for d in dirnames:
                    sub = dirpath / d
                    if folder_support_fn(sub):
                        record = make_basic_record(sub, base, is_folder=False)
                        if record:
                            records.append(record)
                    else:
                        keep.append(d)
                # In-place assignment prunes the walk (topdown=True): folder
                # datasets are not descended into.
                dirnames[:] = keep

            # Files
            for name in filenames:
                p = dirpath / name
                if include_all or p.suffix.lower().lstrip(".") in accepted_extensions:
                    record = make_basic_record(p, base, is_folder=False)
                    if record:
                        records.append(record)

    return pl.DataFrame(records) if records else pl.DataFrame()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _aggregate_folder_sizes(df: pl.DataFrame) -> pl.DataFrame:
    """
    Aggregates file sizes up to their parent folders in the DataFrame.
    Assumes df contains 'path', 'type', 'parent', 'size_bytes', 'depth' columns.

    Rows typed "file" keep their size. Rows typed "folder" start at 0 and, going
    from the deepest depth upwards, receive the summed size of the rows one level
    below whose 'parent' equals the folder's 'path'.

    Returns:
        The DataFrame with 'size_bytes' (cast to Int64) replaced by the aggregated
        sizes; an empty input is returned unchanged.
    """
    if df.is_empty():
        return df

    # Ensure 'size_bytes' is numerical
    df = df.with_columns(pl.col("size_bytes").cast(pl.Int64))

    # Working column: files start with their own size, every other row with 0.
    df = df.with_columns(
        pl.when(pl.col("type") == "file")
        .then(pl.col("size_bytes"))
        .otherwise(0)  # non-file rows accumulate their size from children below
        .alias("temp_calculated_size")
    )

    # Depths in descending order, so children are finalized before their parents.
    unique_depths = sorted(df["depth"].unique().to_list(), reverse=True)

    for current_depth in unique_depths:
        # Sum the working size of all rows (files and folders) one level below
        # the current depth, grouped by their 'parent' path. For the deepest
        # depth there are no such rows and the result is empty.
        children_sums_for_parents = df.filter(pl.col("depth") == current_depth + 1) \
            .group_by("parent") \
            .agg(pl.col("temp_calculated_size").sum().alias("children_total_size"))

        # Join the sums onto the rows whose 'path' is that parent and add them
        # to folder rows; rows without children get null, treated as 0.
        df = df.join(
            children_sums_for_parents,
            left_on="path",  # Folder's path is the parent for its children
            right_on="parent",
            how="left"
        ).with_columns(
            pl.when(pl.col("type") == "folder")
            .then(
                pl.col("temp_calculated_size") + pl.col("children_total_size").fill_null(0)
            )
            .otherwise(pl.col("temp_calculated_size"))  # Files keep their original size
            .alias("temp_calculated_size")
        ).drop("children_total_size")  # Drop the temporary join column

    # The working column now holds the final sizes; write it back to 'size_bytes'.
    df = df.with_columns(pl.col("temp_calculated_size").alias("size_bytes")).drop("temp_calculated_size")

    return df
|