convoviz 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- convoviz/__init__.py +34 -0
- convoviz/__main__.py +6 -0
- convoviz/analysis/__init__.py +22 -0
- convoviz/analysis/graphs.py +879 -0
- convoviz/analysis/wordcloud.py +204 -0
- convoviz/assets/colormaps.txt +15 -0
- convoviz/assets/fonts/AmaticSC-Regular.ttf +0 -0
- convoviz/assets/fonts/ArchitectsDaughter-Regular.ttf +0 -0
- convoviz/assets/fonts/BebasNeue-Regular.ttf +0 -0
- convoviz/assets/fonts/Borel-Regular.ttf +0 -0
- convoviz/assets/fonts/Courgette-Regular.ttf +0 -0
- convoviz/assets/fonts/CroissantOne-Regular.ttf +0 -0
- convoviz/assets/fonts/Handjet-Regular.ttf +0 -0
- convoviz/assets/fonts/IndieFlower-Regular.ttf +0 -0
- convoviz/assets/fonts/Kalam-Regular.ttf +0 -0
- convoviz/assets/fonts/Lobster-Regular.ttf +0 -0
- convoviz/assets/fonts/MartianMono-Regular.ttf +0 -0
- convoviz/assets/fonts/MartianMono-Thin.ttf +0 -0
- convoviz/assets/fonts/Montserrat-Regular.ttf +0 -0
- convoviz/assets/fonts/Mooli-Regular.ttf +0 -0
- convoviz/assets/fonts/Pacifico-Regular.ttf +0 -0
- convoviz/assets/fonts/PlayfairDisplay-Regular.ttf +0 -0
- convoviz/assets/fonts/Raleway-Regular.ttf +0 -0
- convoviz/assets/fonts/RobotoMono-Regular.ttf +0 -0
- convoviz/assets/fonts/RobotoMono-Thin.ttf +0 -0
- convoviz/assets/fonts/RobotoSlab-Regular.ttf +0 -0
- convoviz/assets/fonts/RobotoSlab-Thin.ttf +0 -0
- convoviz/assets/fonts/Ruwudu-Regular.ttf +0 -0
- convoviz/assets/fonts/Sacramento-Regular.ttf +0 -0
- convoviz/assets/fonts/SedgwickAveDisplay-Regular.ttf +0 -0
- convoviz/assets/fonts/ShadowsIntoLight-Regular.ttf +0 -0
- convoviz/assets/fonts/TitilliumWeb-Regular.ttf +0 -0
- convoviz/assets/fonts/Yellowtail-Regular.ttf +0 -0
- convoviz/assets/fonts/YsabeauOffice-Regular.ttf +0 -0
- convoviz/assets/fonts/YsabeauSC-Regular.ttf +0 -0
- convoviz/assets/fonts/YsabeauSC-Thin.ttf +0 -0
- convoviz/assets/fonts/Zeyada-Regular.ttf +0 -0
- convoviz/assets/stopwords.txt +1 -0
- convoviz/cli.py +149 -0
- convoviz/config.py +120 -0
- convoviz/exceptions.py +47 -0
- convoviz/interactive.py +264 -0
- convoviz/io/__init__.py +21 -0
- convoviz/io/assets.py +109 -0
- convoviz/io/loaders.py +191 -0
- convoviz/io/writers.py +231 -0
- convoviz/logging_config.py +69 -0
- convoviz/models/__init__.py +24 -0
- convoviz/models/collection.py +115 -0
- convoviz/models/conversation.py +158 -0
- convoviz/models/message.py +218 -0
- convoviz/models/node.py +66 -0
- convoviz/pipeline.py +184 -0
- convoviz/py.typed +0 -0
- convoviz/renderers/__init__.py +10 -0
- convoviz/renderers/markdown.py +269 -0
- convoviz/renderers/yaml.py +119 -0
- convoviz/utils.py +155 -0
- convoviz-0.4.1.dist-info/METADATA +215 -0
- convoviz-0.4.1.dist-info/RECORD +62 -0
- convoviz-0.4.1.dist-info/WHEEL +4 -0
- convoviz-0.4.1.dist-info/entry_points.txt +3 -0
convoviz/interactive.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Interactive configuration prompts using questionary."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Literal, Protocol, cast
|
|
6
|
+
|
|
7
|
+
from questionary import Choice, Style, checkbox, select
|
|
8
|
+
from questionary import path as qst_path
|
|
9
|
+
from questionary import text as qst_text
|
|
10
|
+
|
|
11
|
+
from convoviz.config import ConvovizConfig, OutputKind, get_default_config
|
|
12
|
+
from convoviz.io.loaders import find_latest_zip, validate_zip
|
|
13
|
+
from convoviz.utils import colormaps, default_font_path, font_names, font_path, validate_header
|
|
14
|
+
|
|
15
|
+
# Colour theme passed as ``style=`` to every questionary prompt in this module.
# Each rule pairs a questionary style token with a style string (colour + attributes).
CUSTOM_STYLE = Style(
    [
        ("qmark", "fg:#34eb9b bold"),
        ("question", "bold fg:#e0e0e0"),
        ("answer", "fg:#34ebeb bold"),
        ("pointer", "fg:#e834eb bold"),
        ("highlighted", "fg:#349ceb bold"),
        ("selected", "fg:#34ebeb"),
        ("separator", "fg:#eb3434"),
        ("instruction", "fg:#eb9434"),
        ("text", "fg:#b2eb34"),
        ("disabled", "fg:#858585 italic"),
    ]
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _QuestionaryPrompt[T](Protocol):
|
|
34
|
+
def ask(self) -> T | None: ...
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _ask_or_cancel[T](prompt: _QuestionaryPrompt[T]) -> T:
|
|
38
|
+
"""Ask a questionary prompt; treat Ctrl+C/Ctrl+D as cancelling the run.
|
|
39
|
+
|
|
40
|
+
questionary's `.ask()` returns `None` on cancellation (Ctrl+C / Ctrl+D). We
|
|
41
|
+
convert that to `KeyboardInterrupt` so callers can abort the whole
|
|
42
|
+
interactive session with a single Ctrl+C.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
result = prompt.ask()
|
|
46
|
+
if result is None:
|
|
47
|
+
raise KeyboardInterrupt
|
|
48
|
+
return result
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _validate_input_path(raw: str) -> bool | str:
|
|
52
|
+
path = Path(raw)
|
|
53
|
+
if not path.exists():
|
|
54
|
+
return "Path must exist"
|
|
55
|
+
|
|
56
|
+
if path.is_dir():
|
|
57
|
+
if (path / "conversations.json").exists():
|
|
58
|
+
return True
|
|
59
|
+
return "Directory must contain conversations.json"
|
|
60
|
+
|
|
61
|
+
if path.suffix.lower() == ".json":
|
|
62
|
+
return True
|
|
63
|
+
|
|
64
|
+
if path.suffix.lower() == ".zip":
|
|
65
|
+
return True if validate_zip(path) else "ZIP must contain conversations.json"
|
|
66
|
+
|
|
67
|
+
return "Input must be a .zip, a .json, or a directory containing conversations.json"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Names of the YAML metadata toggles offered in the checkbox below. Defined once so
# the choices shown and the attributes written back can never drift apart.
_YAML_HEADER_FIELDS: tuple[str, ...] = (
    "title",
    "tags",
    "chat_link",
    "create_time",
    "update_time",
    "model",
    "used_plugins",
    "message_count",
    "content_types",
    "custom_instructions",
)


def _prompt_io_paths(config: ConvovizConfig) -> None:
    """Ask for the input location and the output folder and store them on *config*."""
    input_default = str(config.input_path) if config.input_path else ""
    input_result: str = _ask_or_cancel(
        qst_path(
            "Enter the path to the export ZIP, conversations JSON, or extracted directory:",
            default=input_default,
            validate=_validate_input_path,
            style=CUSTOM_STYLE,
        )
    )
    if input_result:
        config.input_path = Path(input_result)
        logger.debug(f"User selected input: {config.input_path}")

    output_result: str = _ask_or_cancel(
        qst_path(
            "Enter the path to the output folder:",
            default=str(config.output_folder),
            style=CUSTOM_STYLE,
        )
    )
    if output_result:
        config.output_folder = Path(output_result)
        logger.debug(f"User selected output: {config.output_folder}")


def _prompt_markdown_settings(config: ConvovizConfig) -> None:
    """Ask for author headers, markdown flavor and YAML metadata headers."""
    # Author headers: one text prompt per role, pre-filled with the current value.
    headers = config.message.author_headers
    for role in ["user", "assistant"]:
        current = getattr(headers, role)
        result: str = _ask_or_cancel(
            qst_text(
                f"Enter the message header for '{role}':",
                default=current,
                # A validator returns True or the error message to display.
                validate=lambda t: validate_header(t)
                or "Must be a valid markdown header (e.g., # Title)",
                style=CUSTOM_STYLE,
            )
        )
        if result:
            setattr(headers, role, result)
    logger.debug(f"User selected headers: {headers}")

    # Markdown flavor
    flavor_result = cast(
        Literal["standard", "obsidian"],
        _ask_or_cancel(
            select(
                "Select the markdown flavor:",
                choices=["standard", "obsidian"],
                default=config.conversation.markdown.flavor,
                style=CUSTOM_STYLE,
            )
        ),
    )
    if flavor_result:
        config.conversation.markdown.flavor = flavor_result
        logger.debug(f"User selected flavor: {config.conversation.markdown.flavor}")

    # YAML headers: every field is pre-checked according to its current setting,
    # then overwritten with whether the user left it ticked.
    yaml_config = config.conversation.yaml
    yaml_choices = [
        Choice(title=field, checked=getattr(yaml_config, field)) for field in _YAML_HEADER_FIELDS
    ]
    selected: list[str] = _ask_or_cancel(
        checkbox(
            "Select YAML metadata headers to include:",
            choices=yaml_choices,
            style=CUSTOM_STYLE,
        )
    )
    selected_set = set(selected)
    for field_name in _YAML_HEADER_FIELDS:
        setattr(yaml_config, field_name, field_name in selected_set)


def _prompt_wordcloud_settings(config: ConvovizConfig) -> None:
    """Ask for the word cloud font, colormap and custom stopwords."""
    # Font: only prompted when at least one font name is available.
    available_fonts = font_names()
    if available_fonts:
        current_font = (
            config.wordcloud.font_path.stem if config.wordcloud.font_path else available_fonts[0]
        )
        font_result: str = _ask_or_cancel(
            select(
                "Select the font for word clouds:",
                choices=available_fonts,
                # Fall back to the first font when the configured one is not in the list.
                default=current_font if current_font in available_fonts else available_fonts[0],
                style=CUSTOM_STYLE,
            )
        )
        if font_result:
            config.wordcloud.font_path = font_path(font_result)

    # Colormap: same fallback rule as for fonts.
    available_colormaps = colormaps()
    if available_colormaps:
        colormap_result: str = _ask_or_cancel(
            select(
                "Select the color theme for word clouds:",
                choices=available_colormaps,
                default=config.wordcloud.colormap
                if config.wordcloud.colormap in available_colormaps
                else available_colormaps[0],
                style=CUSTOM_STYLE,
            )
        )
        if colormap_result:
            config.wordcloud.colormap = colormap_result

    # Custom stopwords: stored as entered, even when empty.
    stopwords_result: str = _ask_or_cancel(
        qst_text(
            "Enter custom stopwords (comma-separated):",
            default=config.wordcloud.custom_stopwords,
            style=CUSTOM_STYLE,
        )
    )
    config.wordcloud.custom_stopwords = stopwords_result


def run_interactive_config(initial_config: ConvovizConfig | None = None) -> ConvovizConfig:
    """Run interactive prompts to configure convoviz.

    The prompts run in a fixed order: input path, output folder, outputs to
    generate, then markdown settings and word cloud settings (each only when the
    corresponding output was selected). The configuration object is updated in
    place and returned.

    Args:
        initial_config: Optional starting configuration (uses defaults if None)

    Returns:
        Updated configuration based on user input

    Raises:
        KeyboardInterrupt: If the user cancels any prompt
    """
    config = initial_config or get_default_config()
    logger.info("Starting interactive configuration")

    # Set sensible defaults if not already set
    if not config.input_path:
        latest = find_latest_zip()
        if latest:
            config.input_path = latest

    if not config.wordcloud.font_path:
        config.wordcloud.font_path = default_font_path()

    _prompt_io_paths(config)

    # Prompt for outputs to generate
    output_choices = [
        Choice(title="Markdown conversations", value=OutputKind.MARKDOWN, checked=True),
        Choice(title="Graphs (usage analytics)", value=OutputKind.GRAPHS, checked=True),
        Choice(title="Word clouds", value=OutputKind.WORDCLOUDS, checked=True),
    ]
    selected_outputs: list[OutputKind] = _ask_or_cancel(
        checkbox(
            "Select outputs to generate:",
            choices=output_choices,
            style=CUSTOM_STYLE,
        )
    )
    config.outputs = set(selected_outputs) if selected_outputs else set()
    logger.debug(f"User selected outputs: {config.outputs}")

    # Follow-up questions are only asked for the outputs that were selected.
    if OutputKind.MARKDOWN in config.outputs:
        _prompt_markdown_settings(config)

    if OutputKind.WORDCLOUDS in config.outputs:
        _prompt_wordcloud_settings(config)

    return config
|
convoviz/io/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""I/O operations for convoviz."""
|
|
2
|
+
|
|
3
|
+
from convoviz.io.loaders import (
|
|
4
|
+
load_collection_from_json,
|
|
5
|
+
load_collection_from_zip,
|
|
6
|
+
load_conversation_from_json,
|
|
7
|
+
)
|
|
8
|
+
from convoviz.io.writers import (
|
|
9
|
+
save_collection,
|
|
10
|
+
save_conversation,
|
|
11
|
+
save_custom_instructions,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Names re-exported as the public API of this package.
__all__ = [
    # Loaders (from convoviz.io.loaders)
    "load_collection_from_json",
    "load_collection_from_zip",
    "load_conversation_from_json",
    # Writers (from convoviz.io.writers)
    "save_collection",
    "save_conversation",
    "save_custom_instructions",
]
|
convoviz/io/assets.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"Asset management functions."
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import shutil
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def resolve_asset_path(source_dir: Path, asset_id: str) -> Path | None:
|
|
11
|
+
"""Find the actual file for a given asset ID in the source directory.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
source_dir: Directory to search in
|
|
15
|
+
asset_id: The asset ID (e.g., "file-uuid")
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
Path to the found file, or None
|
|
19
|
+
"""
|
|
20
|
+
if not source_dir.exists():
|
|
21
|
+
return None
|
|
22
|
+
|
|
23
|
+
source_dir = source_dir.resolve()
|
|
24
|
+
|
|
25
|
+
# Safety check for asset_id
|
|
26
|
+
if ".." in asset_id or "/" in asset_id or "\\" in asset_id:
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
# 1. Try exact match
|
|
30
|
+
exact_path = (source_dir / asset_id).resolve()
|
|
31
|
+
if exact_path.exists() and exact_path.is_file() and exact_path.is_relative_to(source_dir):
|
|
32
|
+
logger.debug(f"Resolved asset (exact): {asset_id} -> {exact_path}")
|
|
33
|
+
return exact_path
|
|
34
|
+
|
|
35
|
+
# 2. Try prefix match in root
|
|
36
|
+
try:
|
|
37
|
+
candidates = list(source_dir.glob(f"{asset_id}*"))
|
|
38
|
+
files = [
|
|
39
|
+
p.resolve()
|
|
40
|
+
for p in candidates
|
|
41
|
+
if p.is_file() and p.resolve().is_relative_to(source_dir)
|
|
42
|
+
]
|
|
43
|
+
if files:
|
|
44
|
+
logger.debug(f"Resolved asset (prefix root): {asset_id} -> {files[0]}")
|
|
45
|
+
return files[0]
|
|
46
|
+
except Exception:
|
|
47
|
+
pass
|
|
48
|
+
|
|
49
|
+
# 3. Try prefix match in dalle-generations
|
|
50
|
+
dalle_dir = source_dir / "dalle-generations"
|
|
51
|
+
if dalle_dir.exists() and dalle_dir.is_dir():
|
|
52
|
+
dalle_dir = dalle_dir.resolve()
|
|
53
|
+
try:
|
|
54
|
+
candidates = list(dalle_dir.glob(f"{asset_id}*"))
|
|
55
|
+
files = [
|
|
56
|
+
p.resolve()
|
|
57
|
+
for p in candidates
|
|
58
|
+
if p.is_file() and p.resolve().is_relative_to(dalle_dir)
|
|
59
|
+
]
|
|
60
|
+
if files:
|
|
61
|
+
logger.debug(f"Resolved asset (dalle): {asset_id} -> {files[0]}")
|
|
62
|
+
return files[0]
|
|
63
|
+
except Exception:
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
# 4. Try prefix match in user-* directories (new 2025 format)
|
|
67
|
+
try:
|
|
68
|
+
for user_dir in source_dir.glob("user-*"):
|
|
69
|
+
if user_dir.is_dir():
|
|
70
|
+
user_dir = user_dir.resolve()
|
|
71
|
+
candidates = list(user_dir.glob(f"{asset_id}*"))
|
|
72
|
+
files = [
|
|
73
|
+
p.resolve()
|
|
74
|
+
for p in candidates
|
|
75
|
+
if p.is_file() and p.resolve().is_relative_to(user_dir)
|
|
76
|
+
]
|
|
77
|
+
if files:
|
|
78
|
+
logger.debug(f"Resolved asset (user dir): {asset_id} -> {files[0]}")
|
|
79
|
+
return files[0]
|
|
80
|
+
except Exception:
|
|
81
|
+
pass
|
|
82
|
+
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def copy_asset(source_path: Path, dest_dir: Path) -> str:
    """Copy an asset into the ``assets`` folder of the output directory.

    An asset whose file name already exists in the target folder is left
    untouched. A failed copy is logged as a warning and does not raise; the
    relative path is returned either way.

    Args:
        source_path: The source file path
        dest_dir: The root output directory (assets will be in dest_dir/assets)

    Returns:
        Relative path to the asset (e.g., "assets/image.png")
    """
    filename = source_path.name
    target_dir = dest_dir / "assets"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / filename

    # Forward slashes keep the link usable in Markdown, even on Windows.
    relative_link = f"assets/{filename}"

    if target.exists():
        return relative_link

    try:
        shutil.copy2(source_path, target)
        logger.debug(f"Copied asset: {source_path.name}")
    except Exception as e:
        logger.warning(f"Failed to copy asset {source_path}: {e}")

    return relative_link
|
convoviz/io/loaders.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Loading functions for conversations and collections."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path, PurePosixPath
|
|
5
|
+
from zipfile import ZipFile
|
|
6
|
+
|
|
7
|
+
from orjson import loads
|
|
8
|
+
|
|
9
|
+
from convoviz.exceptions import InvalidZipError
|
|
10
|
+
from convoviz.models import Conversation, ConversationCollection
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _is_safe_zip_member_name(name: str) -> bool:
    """Return True if a ZIP entry name is safe to extract.

    This is intentionally OS-agnostic: backslashes are treated as path
    separators just like ``/``. Rejected names are: empty names, absolute paths
    (including UNC-style double-separator prefixes), names whose first component
    starts with a drive letter (``C:/x`` as well as drive-relative ``C:x``) or
    ends with a colon, and names containing a ``..`` component.

    Args:
        name: The entry name as stored in the archive

    Returns:
        True if the name is a plain relative path, False otherwise
    """
    normalized = name.replace("\\", "/")
    if not normalized:
        return False

    member_path = PurePosixPath(normalized)

    # Absolute paths (e.g. "/etc/passwd"); after normalization this also covers
    # UNC-style names, which start with two separators.
    if member_path.is_absolute():
        return False

    parts = member_path.parts
    first = parts[0] if parts else ""

    # Windows drive prefixes stored in the archive. Checking the first two
    # characters catches drive-relative names such as "C:evil.txt", which do not
    # end with a colon.
    has_drive_letter = (
        len(first) >= 2 and first[1] == ":" and first[0].isascii() and first[0].isalpha()
    )
    if has_drive_letter or first.endswith(":"):
        return False

    return ".." not in parts
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_archive(filepath: Path) -> Path:
    """Extract a ZIP file next to itself and return the extraction folder.

    The target folder is the archive path without its suffix. Every entry name
    is validated before anything is written, to prevent Path Traversal
    (Zip-Slip).

    Args:
        filepath: Path to the ZIP file

    Returns:
        Path to the extracted folder

    Raises:
        InvalidZipError: If an entry name is unsafe or would land outside the folder
    """
    folder = filepath.with_suffix("")
    folder.mkdir(parents=True, exist_ok=True)
    logger.info(f"Extracting archive: {filepath} to {folder}")

    with ZipFile(filepath) as archive:
        extraction_root = folder.resolve()
        for entry_name in archive.namelist():
            # Two layers of defence: a lexical, OS-agnostic name check (malicious
            # archives can embed backslashes or drive-letter tricks), followed by
            # a check on the resolved target path.
            stays_inside = _is_safe_zip_member_name(entry_name) and (
                (folder / entry_name.replace("\\", "/")).resolve().is_relative_to(extraction_root)
            )
            if not stays_inside:
                raise InvalidZipError(
                    str(filepath), reason=f"Malicious path in ZIP: {entry_name}"
                )

        archive.extractall(folder)

    return folder
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def validate_zip(filepath: Path) -> bool:
    """Check if a ZIP file contains conversations.json.

    The file must exist, carry a ``.zip`` suffix (compared case-insensitively, so
    ``EXPORT.ZIP`` is accepted) and list ``conversations.json`` at the archive
    root.

    Args:
        filepath: Path to the ZIP file

    Returns:
        True if valid, False otherwise (including unreadable or corrupt archives)
    """
    if not filepath.is_file() or filepath.suffix.lower() != ".zip":
        return False
    try:
        with ZipFile(filepath) as zf:
            return "conversations.json" in zf.namelist()
    except Exception:
        # Deliberately broad: this is a yes/no probe, so any failure to open or
        # read the archive simply means "not a valid export".
        return False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def load_conversation_from_json(filepath: Path | str) -> Conversation:
    """Load a single conversation from a JSON file.

    The file is read as UTF-8 text, parsed, and the resulting mapping is passed
    to ``Conversation`` as keyword arguments.

    Args:
        filepath: Path to the JSON file

    Returns:
        Loaded Conversation object
    """
    raw_text = Path(filepath).read_text(encoding="utf-8")
    fields = loads(raw_text)
    return Conversation(**fields)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def load_collection_from_json(filepath: Path | str) -> ConversationCollection:
    """Load a conversation collection from a JSON file.

    Two layouts are accepted: a top-level array of conversation objects, or a
    top-level object that holds that array under a "conversations" key. The
    directory containing the file becomes the collection's ``source_path``.

    Args:
        filepath: Path to the JSON file

    Returns:
        Loaded ConversationCollection object
    """
    json_path = Path(filepath)
    logger.debug(f"Loading collection from JSON: {json_path}")
    payload = loads(json_path.read_text(encoding="utf-8"))

    # Unwrap exports that nest the list inside a top-level object.
    is_wrapped = isinstance(payload, dict) and "conversations" in payload
    conversations = payload["conversations"] if is_wrapped else payload

    return ConversationCollection(conversations=conversations, source_path=json_path.parent)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def load_collection_from_zip(filepath: Path | str) -> ConversationCollection:
    """Load a conversation collection from a ChatGPT export ZIP file.

    The archive is validated, extracted, and the extracted
    ``conversations.json`` is then loaded as a collection.

    Args:
        filepath: Path to the ZIP file

    Returns:
        Loaded ConversationCollection object

    Raises:
        InvalidZipError: If the ZIP file is invalid or missing conversations.json
    """
    archive_path = Path(filepath)
    if validate_zip(archive_path):
        export_dir = extract_archive(archive_path)
        return load_collection_from_json(export_dir / "conversations.json")
    raise InvalidZipError(str(archive_path))
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def find_latest_zip(directory: Path | None = None) -> Path | None:
|
|
157
|
+
"""Find the most recently created ZIP file in a directory.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
directory: Directory to search (defaults to ~/Downloads)
|
|
161
|
+
|
|
162
|
+
Returns:
|
|
163
|
+
Path to the most recent ZIP, or None if none found
|
|
164
|
+
"""
|
|
165
|
+
if directory is None:
|
|
166
|
+
directory = Path.home() / "Downloads"
|
|
167
|
+
|
|
168
|
+
zip_files = list(directory.glob("*.zip"))
|
|
169
|
+
if not zip_files:
|
|
170
|
+
return None
|
|
171
|
+
|
|
172
|
+
return max(zip_files, key=lambda p: p.stat().st_ctime)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def find_latest_bookmarklet_json(directory: Path | None = None) -> Path | None:
|
|
176
|
+
"""Find the most recent bookmarklet JSON file in a directory.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
directory: Directory to search (defaults to ~/Downloads)
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
Path to the most recent bookmarklet JSON, or None if none found
|
|
183
|
+
"""
|
|
184
|
+
if directory is None:
|
|
185
|
+
directory = Path.home() / "Downloads"
|
|
186
|
+
|
|
187
|
+
bookmarklet_files = [f for f in directory.glob("*.json") if "bookmarklet" in f.name.lower()]
|
|
188
|
+
if not bookmarklet_files:
|
|
189
|
+
return None
|
|
190
|
+
|
|
191
|
+
return max(bookmarklet_files, key=lambda p: p.stat().st_ctime)
|