vlmparse 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/METADATA +11 -1
  39. vlmparse-0.1.2.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,148 @@
1
+ import os
2
+ import traceback
3
+ import zipfile
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import orjson
8
+ from PIL import Image
9
+ from PIL import Image as PILImage
10
+ from pydantic import Field
11
+
12
+ from vlmparse.base_model import VLMParseBaseModel
13
+ from vlmparse.utils import from_base64, to_base64
14
+
15
+ from .box import BoundingBox
16
+
17
+
18
class ProcessingError(VLMParseBaseModel):
    """Snapshot of a failure: which component failed plus the traceback text."""

    module_class: str
    traceback: str

    @classmethod
    def from_class(cls, klass):
        """Build an error record for *klass* from the currently handled exception.

        Must be called from inside an ``except`` block so that
        ``traceback.format_exc()`` captures the active exception.
        """
        failing_class = type(klass).__name__
        tb_text = traceback.format_exc()
        return cls(module_class=failing_class, traceback=tb_text)
28
+
29
+
30
class Item(VLMParseBaseModel):
    """A single layout element detected on a page."""

    category: str  # layout category label (rendered next to the box when drawing)
    box: BoundingBox  # location of the element on the page image
    text: str  # extracted text content of the element
34
+
35
+
36
class Page(VLMParseBaseModel):
    """One converted page: extracted text/items plus a lazily materialized image."""

    text: str | None = None  # final text for the page, None if not (yet) converted
    raw_response: str | None = None  # unparsed model output, kept for debugging
    items: list[Item] | None = None  # layout items with boxes, if layout parsing ran
    error: ProcessingError | None = None  # set when processing this page failed
    # One of three forms: a PIL image, a base64-encoded string, or a dict of
    # render parameters (file_path/page_idx/dpi/max_image_size) for lazy loading.
    buffer_image: Optional[Image.Image | str | dict] = None
    latency: Optional[float] = None
    """Time taken to process the page in seconds."""

    @property
    def image(self):
        """Return the page image as a PIL image, materializing it on first access.

        A dict buffer is rendered from the source file and resized; a string
        buffer is base64-decoded. The materialized image replaces the buffer
        in place, so subsequent accesses are cheap.
        """
        if isinstance(self.buffer_image, dict):
            # Lazy path: render the page from the original document on demand.
            from vlmparse.build_doc import convert_specific_page_to_image, resize_image

            image = convert_specific_page_to_image(
                self.buffer_image["file_path"],
                self.buffer_image["page_idx"],
                self.buffer_image["dpi"],
            )
            image = resize_image(image, self.buffer_image["max_image_size"])
            self.buffer_image = image

        if isinstance(self.buffer_image, str):
            # Serialized form (e.g. loaded from a zip archive): decode base64.
            self.buffer_image = from_base64(self.buffer_image)
        return self.buffer_image

    def get_image_with_boxes(self, layout=False):
        """Return the page image, optionally annotated with item boxes.

        When *layout* is True and items are present, draws a red rectangle and
        the category label for every item. NOTE(review): drawing happens
        directly on the cached image returned by ``self.image``, so repeated
        calls accumulate annotations — confirm whether a copy is intended.
        """
        from PIL import ImageDraw

        from .box import draw_text_of_box

        image = self.image

        if layout:
            if self.items is None:
                return image
            items = self.items
            for item in items:
                box = item.box

                draw = ImageDraw.Draw(image)
                draw.rectangle(
                    (box.l, box.t, box.r, box.b),
                    outline=(255, 0, 0),
                    width=5,
                )

                image = draw_text_of_box(
                    image, box.l, box.t, item.category, font_size=40
                )
        return image
87
+
88
+
89
class Document(VLMParseBaseModel):
    """A converted document: one Page per source page plus processing metadata."""

    file_path: str  # path of the source file this document was built from
    pages: list[Page] = []
    error: ProcessingError | None = None  # document-level failure, if any
    metadata: dict = Field(default_factory=dict)
    latency: Optional[float] = None
    """Time taken to process the document in seconds."""

    @property
    def text(self):
        """Concatenate page texts separated by blank lines.

        Pages without text (``page.text`` is None, e.g. failed pages)
        contribute an empty string instead of raising a TypeError in join().
        """
        return "\n\n".join(page.text or "" for page in self.pages)

    @property
    def is_error(self):
        """True if the document or any of its pages recorded an error."""
        return self.error is not None or any(
            page.error is not None for page in self.pages
        )

    def to_zip(
        self,
        file_path,
        overwrite_file: bool = True,
        image_extension: str = "webp",
    ):
        """Serialize the document as ``<file_path>.zip`` containing ``data.json``.

        PIL images are base64-encoded with *image_extension*. When
        *overwrite_file* is False, an existing archive aborts the write.
        """
        file_path = Path(file_path)
        os.makedirs(file_path.parent, exist_ok=True)
        # Normalize the target name so it ends with exactly one ".zip".
        archive_path = str(file_path).removesuffix(".zip") + ".zip"

        if not overwrite_file:
            # NOTE(review): assert is stripped under -O; consider raising
            # FileExistsError if this guard must always hold.
            assert not os.path.isfile(archive_path)

        def _custom_encoder(x):
            # orjson only calls this for types it cannot serialize natively.
            if isinstance(x, PILImage.Image):
                return to_base64(x, image_extension)
            if isinstance(x, str):
                return x
            raise TypeError(
                f"Object of type {type(x).__name__} is not JSON serializable"
            )

        json_bytes = orjson.dumps(
            self.model_dump(),
            default=_custom_encoder,
            option=orjson.OPT_INDENT_2,
        )

        with zipfile.ZipFile(
            archive_path, "w", compression=zipfile.ZIP_DEFLATED
        ) as zipf:
            zipf.writestr("data.json", json_bytes)

    @classmethod
    def from_zip(cls, file_path):
        """Load a Document previously written by :meth:`to_zip`.

        Raises:
            FileNotFoundError: if the archive has no ``data.json`` entry.
        """
        with zipfile.ZipFile(file_path, "r") as zipf:
            if "data.json" not in zipf.namelist():
                raise FileNotFoundError("data.json not found in the archive")

            json_bytes = zipf.read("data.json")
            data = orjson.loads(json_bytes)
            return cls.model_validate(data)
@@ -0,0 +1,199 @@
1
+ import os
2
+ from typing import Callable
3
+
4
+ from loguru import logger
5
+ from pydantic import BaseModel, Field
6
+
7
+ from .utils import docker_server
8
+
9
+
10
class DockerServerConfig(BaseModel):
    """Base configuration for deploying a Docker server."""

    model_name: str  # model identifier; also used for the container name and labels
    docker_image: str  # image to run (pulled, or built from dockerfile_dir)
    dockerfile_dir: str | None = None  # when set, build the image from this directory
    command_args: list[str] = Field(default_factory=list)  # extra container command args
    # Log substrings that signal the server inside the container is ready.
    server_ready_indicators: list[str] = Field(
        default_factory=lambda: [
            "Application startup complete",
            "Uvicorn running",
            "Starting vLLM API server",
        ]
    )
    docker_port: int = 8056  # host port mapped to the container port
    gpu_device_ids: list[str] | None = None  # None = all GPUs; [] or [""] = CPU only
    container_port: int = 8000  # port the server listens on inside the container
    environment: dict[str, str] = Field(default_factory=dict)  # container env vars
    volumes: dict[str, dict] | None = None  # docker-py volume mapping, if any
    entrypoint: str | None = None  # override the image entrypoint when set
    aliases: list[str] = Field(default_factory=list)  # alternative names for this config

    class Config:
        # Allow callers/subclasses to attach extra fields not declared here.
        extra = "allow"

    @property
    def client_config(self):
        """Override in subclasses to return appropriate client config."""
        raise NotImplementedError

    def get_client(self, **kwargs):
        """Build a client from this config's :attr:`client_config`."""
        return self.client_config.get_client(**kwargs)

    def get_server(self, auto_stop: bool = True):
        """Wrap this config in a ConverterServer managing the container lifecycle."""
        return ConverterServer(config=self, auto_stop=auto_stop)

    def get_command(self) -> list[str] | None:
        """Build command for container. Override in subclasses for specific logic."""
        return self.command_args if self.command_args else None

    def get_volumes(self) -> dict | None:
        """Setup volumes for container. Override in subclasses for specific logic."""
        return self.volumes

    def get_environment(self) -> dict | None:
        """Setup environment variables. Override in subclasses for specific logic."""
        return self.environment if self.environment else None

    def get_base_url_suffix(self) -> str:
        """Return URL suffix (e.g., '/v1' for OpenAI-compatible APIs). Override in subclasses."""
        return ""
61
+
62
+
63
# Name the served model is exposed under (--served-model-name).
DEFAULT_MODEL_NAME = "vllm-model"


class VLLMDockerServerConfig(DockerServerConfig):
    """Configuration for deploying a VLLM Docker server."""

    docker_image: str = "vllm/vllm-openai:latest"
    default_model_name: str = DEFAULT_MODEL_NAME  # value for --served-model-name
    hf_home_folder: str | None = os.getenv("HF_HOME", None)  # host HF cache to mount
    add_model_key_to_server: bool = False  # prepend "--model" before the model name
    container_port: int = 8000
    aliases: list[str] = Field(default_factory=list)

    @property
    def llm_params(self):
        """OpenAI-compatible endpoint parameters for the local vLLM server."""
        from vlmparse.clients.openai_converter import LLMParams

        return LLMParams(
            base_url=f"http://localhost:{self.docker_port}/v1",
            model_name=self.default_model_name,
        )

    @property
    def client_config(self):
        """Client config pointing at this server's OpenAI-compatible API."""
        from vlmparse.clients.openai_converter import OpenAIConverterConfig

        return OpenAIConverterConfig(llm_params=self.llm_params)

    def get_command(self) -> list[str]:
        """Build VLLM-specific command."""
        model_key = ["--model"] if self.add_model_key_to_server else []
        command = (
            model_key
            + [
                self.model_name,
                "--port",
                str(self.container_port),
            ]
            + self.command_args
            + ["--served-model-name", self.default_model_name]
        )
        return command

    def get_volumes(self) -> dict | None:
        """Setup volumes for HuggingFace model caching."""
        if self.hf_home_folder is not None:
            from pathlib import Path

            return {
                str(Path(self.hf_home_folder).absolute()): {
                    "bind": "/root/.cache/huggingface",
                    "mode": "rw",
                }
            }
        return None

    def get_environment(self) -> dict | None:
        """Setup environment variables for VLLM.

        NOTE(review): these are host paths, but get_volumes() mounts the cache
        at /root/.cache/huggingface inside the container — confirm the env
        values are meant to be container-side paths.
        """
        if self.hf_home_folder is not None:
            return {
                "HF_HOME": self.hf_home_folder,
                "TRITON_CACHE_DIR": self.hf_home_folder,
            }
        return None

    def get_base_url_suffix(self) -> str:
        """VLLM uses OpenAI-compatible API with /v1 suffix."""
        return "/v1"
131
+
132
+
133
class ConverterServer:
    """Manages Docker server lifecycle with start/stop methods."""

    def __init__(self, config: DockerServerConfig, auto_stop: bool = True):
        self.config = config
        self.auto_stop = auto_stop  # stop the container when this object is destroyed
        self._server_context = None  # context manager produced by docker_server()
        self._container = None  # docker-py Container once started
        self.base_url = None  # server URL once started

    def start(self):
        """Start the Docker server and return ``(base_url, container)``.

        Idempotent: calling start() while already running logs a warning and
        returns the existing handle. (Fixed: previously the already-started
        branch returned only ``base_url`` while a fresh start returned a
        tuple, so callers could not unpack unconditionally.)
        """
        if self._server_context is not None:
            logger.warning("Server already started")
            return self.base_url, self._container

        # Use the generic docker_server for all server types
        self._server_context = docker_server(config=self.config, cleanup=self.auto_stop)

        self.base_url, self._container = self._server_context.__enter__()
        logger.info(f"Server started at {self.base_url}")
        logger.info(f"Container ID: {self._container.id}")
        logger.info(f"Container name: {self._container.name}")
        return self.base_url, self._container

    def stop(self):
        """Stop the Docker server; safe to call when not started."""
        if self._server_context is not None:
            self._server_context.__exit__(None, None, None)
            self._server_context = None
            self._container = None
            self.base_url = None
            logger.info("Server stopped")

    def __del__(self):
        """Automatically stop server when object is destroyed if auto_stop is True."""
        if self.auto_stop and self._server_context is not None:
            try:
                self.stop()
            except Exception:
                # During interpreter shutdown the modules stop() needs may
                # already be torn down; never let __del__ propagate.
                pass
171
+
172
+
173
class DockerConfigRegistry:
    """Registry for mapping model names to their Docker configurations."""

    def __init__(self):
        # Maps model name -> zero-arg factory producing its DockerServerConfig.
        self._registry = dict()

    def register(
        self, model_name: str, config_factory: Callable[[], DockerServerConfig | None]
    ):
        """Register a config factory for a model name."""
        self._registry[model_name] = config_factory

    def get(self, model_name: str, default=False) -> DockerServerConfig | None:
        """Return the config for *model_name*.

        Unregistered names yield a plain vLLM config when *default* is truthy,
        otherwise None.
        """
        if model_name in self._registry:
            return self._registry[model_name]()
        return VLLMDockerServerConfig(model_name=model_name) if default else None

    def list_models(self) -> list[str]:
        """List all registered model names."""
        return [*self._registry]


# Global registry instance
docker_config_registry = DockerConfigRegistry()
@@ -0,0 +1,250 @@
1
+ import getpass
2
+ import time
3
+ from contextlib import contextmanager
4
+ from pathlib import Path
5
+
6
+ from loguru import logger
7
+
8
+ import docker
9
+
10
+
11
def _ensure_image_exists(
    client: docker.DockerClient,
    image: str,
    dockerfile_path: Path,
):
    """Ensure *image* is available locally, building from *dockerfile_path* if not.

    Args:
        client: docker-py client used for both the lookup and the build.
        image: tag of the image to check/build.
        dockerfile_path: directory containing the Dockerfile.

    Raises:
        FileNotFoundError: image missing and the Dockerfile directory does not exist.
        docker.errors.BuildError: the build stream reported an error.
    """
    try:
        client.images.get(image)
        logger.info(f"Docker image {image} found")
        return
    except docker.errors.ImageNotFound:
        logger.info(f"Docker image {image} not found, building...")

    if not dockerfile_path.exists():
        raise FileNotFoundError(
            f"Dockerfile directory not found at {dockerfile_path}"
        ) from None

    logger.info(f"Building image from {dockerfile_path}")

    # Use the low-level API of the *same* client for real-time streaming.
    # (Fixed: previously this hard-coded unix://var/run/docker.sock, which
    # breaks on Windows and with remote DOCKER_HOST setups; client.api honors
    # the environment the caller's client was created from.)
    api_client = client.api

    # Build the image with streaming
    build_stream = api_client.build(
        path=str(dockerfile_path),
        tag=image,
        rm=True,
        decode=True,  # Automatically decode JSON responses to dict
    )

    # Stream build logs in real-time
    for chunk in build_stream:
        if "stream" in chunk:
            for line in chunk["stream"].splitlines():
                logger.info(line)
        elif "error" in chunk:
            logger.error(chunk["error"])
            raise docker.errors.BuildError(chunk["error"], build_stream) from None
        elif "status" in chunk:
            # Handle status updates (e.g., downloading layers)
            logger.debug(chunk["status"])

    logger.info(f"Successfully built image {image}")
55
+
56
+
57
@contextmanager
def docker_server(
    config: "DockerServerConfig",  # noqa: F821
    timeout: int = 1000,
    cleanup: bool = True,
):
    """Generic context manager for Docker server deployment.

    Args:
        config: DockerServerConfig (can be VLLMDockerServerConfig or GenericDockerServerConfig)
        timeout: Timeout in seconds to wait for server to be ready
        cleanup: If True, stop and remove container on exit. If False, leave container running

    Yields:
        tuple: (base_url, container) - The base URL of the server and the Docker container object

    Raises:
        TimeoutError: if no ready indicator appears in the logs within *timeout*.
        RuntimeError: if the container disappears while starting up.
    """

    client = docker.from_env()
    container = None

    try:
        # Ensure image exists
        logger.info(f"Checking for Docker image {config.docker_image}...")

        if config.dockerfile_dir is not None:
            _ensure_image_exists(
                client, config.docker_image, Path(config.dockerfile_dir)
            )
        else:
            # Pull pre-built image
            try:
                client.images.get(config.docker_image)
                logger.info(f"Docker image {config.docker_image} found locally")
            except docker.errors.ImageNotFound:
                logger.info(
                    f"Docker image {config.docker_image} not found locally, pulling..."
                )
                client.images.pull(config.docker_image)
                logger.info(f"Successfully pulled {config.docker_image}")

        logger.info(
            f"Starting Docker container for {config.model_name} on port {config.docker_port}"
        )

        # Configure GPU access
        device_requests = None

        if config.gpu_device_ids is None:
            # Default: Try to use all GPUs if available
            device_requests = [
                docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
            ]
        elif len(config.gpu_device_ids) > 0 and config.gpu_device_ids[0] != "":
            # Use specific GPU devices
            device_requests = [
                docker.types.DeviceRequest(
                    device_ids=config.gpu_device_ids, capabilities=[["gpu"]]
                )
            ]
        else:
            # Empty list means CPU-only, no GPU
            device_requests = None

        # Use generic methods from config
        command = config.get_command()
        volumes = config.get_volumes()
        environment = config.get_environment()
        container_port = config.container_port
        log_prefix = config.model_name

        # Construct URI for label
        uri = f"http://localhost:{config.docker_port}{config.get_base_url_suffix()}"

        # Determine GPU label
        if config.gpu_device_ids is None:
            gpu_label = "all"
        elif len(config.gpu_device_ids) == 0 or (
            len(config.gpu_device_ids) == 1 and config.gpu_device_ids[0] == ""
        ):
            gpu_label = "cpu"
        else:
            gpu_label = ",".join(config.gpu_device_ids)

        # Start container
        container_kwargs = {
            "image": config.docker_image,
            "ports": {f"{container_port}/tcp": config.docker_port},
            "detach": True,
            "remove": True,
            "name": f"vlmparse-{config.model_name.replace('/', '-')}-{getpass.getuser()}",
            "labels": {
                "vlmparse_model_name": config.model_name,
                "vlmparse_uri": uri,
                "vlmparse_gpus": gpu_label,
            },
        }

        if device_requests is not None:
            container_kwargs["device_requests"] = device_requests
        if command:
            container_kwargs["command"] = command
        if environment:
            container_kwargs["environment"] = environment
        if volumes:
            container_kwargs["volumes"] = volumes
        if config.entrypoint:
            container_kwargs["entrypoint"] = config.entrypoint

        container = client.containers.run(**container_kwargs)

        logger.info(
            f"Container {container.short_id} started, waiting for server to be ready..."
        )

        # Wait for server to be ready
        start_time = time.time()
        server_ready = False
        last_log_position = 0

        # Poll container logs until a ready indicator shows up. Readiness now
        # unambiguously terminates the loop (previously the inner `break` only
        # exited the indicator `for`, so the loop could keep sleeping).
        while not server_ready and time.time() - start_time < timeout:
            try:
                container.reload()
            except docker.errors.NotFound as e:
                logger.error("Container stopped unexpectedly during startup")
                raise RuntimeError(
                    "Container crashed during initialization. Check Docker logs for details."
                ) from e

            if container.status == "running":
                # Get all logs and display new ones
                all_logs = container.logs().decode("utf-8")

                # Display new log lines
                if len(all_logs) > last_log_position:
                    new_logs = all_logs[last_log_position:]
                    for line in new_logs.splitlines():
                        if line.strip():  # Only print non-empty lines
                            logger.info(f"[{log_prefix}] {line}")
                    last_log_position = len(all_logs)

                # Check if server is ready
                for indicator in config.server_ready_indicators:
                    if indicator in all_logs:
                        server_ready = True
                        logger.info(f"Server ready indicator '{indicator}' found in logs")
                        break

            if not server_ready:
                time.sleep(2)

        if not server_ready:
            raise TimeoutError(f"Server did not become ready within {timeout} seconds")

        # Build base URL using config's suffix method
        base_url = (
            f"http://localhost:{config.docker_port}{config.get_base_url_suffix()}"
        )

        logger.info(f"{log_prefix} server ready at {base_url}")

        yield base_url, container

    finally:
        if cleanup and container:
            logger.info(f"Stopping container {container.short_id}")
            container.stop(timeout=10)
            logger.info("Container stopped")
224
+
225
+
226
def get_model_from_uri(uri: str) -> str:
    """Infer the model name served at *uri* from running vlmparse containers.

    Scans running Docker containers for the ``vlmparse_uri`` and
    ``vlmparse_model_name`` labels (set by ``docker_server``) and returns the
    model name of the first container whose URI matches.

    Raises:
        ValueError: if no running container matches the URI.
    """
    model = None
    client = docker.from_env()
    containers = client.containers.list()
    for container in containers:
        c_uri = container.labels.get("vlmparse_uri")
        c_model = container.labels.get("vlmparse_model_name")

        # Check if user URI matches container URI (ignoring /v1 suffix if missing)
        if c_uri and (
            c_uri == uri or c_uri.startswith(uri.rstrip("/")) or uri.startswith(c_uri)
        ):
            # Update URI to the correct one from container (likely has /v1)
            # NOTE(review): the corrected URI is only visible in the error
            # message below; this function returns just the model name.
            if len(c_uri) > len(uri.rstrip("/")):
                logger.info(f"Updating URI from {uri} to {c_uri}")
                uri = c_uri

            # Infer model if not provided
            if model is None and c_model:
                logger.info(f"Inferred model {c_model} from container")
                model = c_model
            break
    if model is None:
        raise ValueError(f"No model found for URI {uri}")
    return model
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ import glob
4
+ import os
5
+ from typing import List, Optional
6
+
7
+ import streamlit as st
8
+
9
+
10
def get_gz_files_count(folder_path: str) -> int:
    """Count files matching ``*.json*`` directly inside *folder_path*.

    NOTE(review): despite the name ("gz") and the UI label in file_selector
    (".zip files"), this counts ``*.json*`` matches — confirm which extension
    is actually intended.
    """
    pattern = os.path.join(folder_path, "*.json*")
    matches = glob.glob(pattern)
    return len(matches)
12
+
13
+
14
def get_subdirectories(path: str) -> List[str]:
    """Return the names of *path*'s immediate subdirectories, sorted alphabetically."""
    entries = os.listdir(path)
    dirs = (name for name in entries if os.path.isdir(os.path.join(path, name)))
    return sorted(dirs)
16
+
17
+
18
def file_selector(root_folder: str) -> Optional[str]:
    """Streamlit widget tree for drilling into *root_folder* and picking a .zip file.

    Renders one selectbox per directory level; once the user stops descending,
    lists the ``*.zip`` files of the deepest selected folder and returns the
    full path of the chosen one, or None while nothing is selected.
    """
    st.title("Folder Navigation")
    if not root_folder or not os.path.exists(root_folder):
        return None

    current_path = selected_path = root_folder
    level = 0
    while True:
        subdirs = get_subdirectories(current_path)
        if not subdirs:
            break
        # Label each directory with its file count.
        # NOTE(review): the count comes from get_gz_files_count (*.json*) but
        # the label says ".zip files" — confirm which is intended.
        dir_options = [
            f"{d} ({get_gz_files_count(os.path.join(current_path, d))} .zip files)"
            for d in subdirs
        ]
        selected = st.selectbox(
            f"Level {level} Selection",
            ["--Select--"] + dir_options,
            key=f"level_{level}",
        )
        if selected == "--Select--" or not selected:
            break
        # Strip the " (N .zip files)" suffix to recover the directory name.
        selected_dir = selected.split(" (", 1)[0]
        current_path = os.path.join(current_path, selected_dir)
        selected_path = current_path
        level += 1

    gz_files = sorted(glob.glob(os.path.join(selected_path, "*.zip")))
    if gz_files:
        selected_file = st.selectbox(
            "Select .zip file",
            ["--Select--"] + [os.path.basename(f) for f in gz_files],
        )
        if selected_file != "--Select--":
            return os.path.join(selected_path, selected_file)
    return None