convoviz 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. convoviz/__init__.py +34 -0
  2. convoviz/__main__.py +6 -0
  3. convoviz/analysis/__init__.py +22 -0
  4. convoviz/analysis/graphs.py +879 -0
  5. convoviz/analysis/wordcloud.py +204 -0
  6. convoviz/assets/colormaps.txt +15 -0
  7. convoviz/assets/fonts/AmaticSC-Regular.ttf +0 -0
  8. convoviz/assets/fonts/ArchitectsDaughter-Regular.ttf +0 -0
  9. convoviz/assets/fonts/BebasNeue-Regular.ttf +0 -0
  10. convoviz/assets/fonts/Borel-Regular.ttf +0 -0
  11. convoviz/assets/fonts/Courgette-Regular.ttf +0 -0
  12. convoviz/assets/fonts/CroissantOne-Regular.ttf +0 -0
  13. convoviz/assets/fonts/Handjet-Regular.ttf +0 -0
  14. convoviz/assets/fonts/IndieFlower-Regular.ttf +0 -0
  15. convoviz/assets/fonts/Kalam-Regular.ttf +0 -0
  16. convoviz/assets/fonts/Lobster-Regular.ttf +0 -0
  17. convoviz/assets/fonts/MartianMono-Regular.ttf +0 -0
  18. convoviz/assets/fonts/MartianMono-Thin.ttf +0 -0
  19. convoviz/assets/fonts/Montserrat-Regular.ttf +0 -0
  20. convoviz/assets/fonts/Mooli-Regular.ttf +0 -0
  21. convoviz/assets/fonts/Pacifico-Regular.ttf +0 -0
  22. convoviz/assets/fonts/PlayfairDisplay-Regular.ttf +0 -0
  23. convoviz/assets/fonts/Raleway-Regular.ttf +0 -0
  24. convoviz/assets/fonts/RobotoMono-Regular.ttf +0 -0
  25. convoviz/assets/fonts/RobotoMono-Thin.ttf +0 -0
  26. convoviz/assets/fonts/RobotoSlab-Regular.ttf +0 -0
  27. convoviz/assets/fonts/RobotoSlab-Thin.ttf +0 -0
  28. convoviz/assets/fonts/Ruwudu-Regular.ttf +0 -0
  29. convoviz/assets/fonts/Sacramento-Regular.ttf +0 -0
  30. convoviz/assets/fonts/SedgwickAveDisplay-Regular.ttf +0 -0
  31. convoviz/assets/fonts/ShadowsIntoLight-Regular.ttf +0 -0
  32. convoviz/assets/fonts/TitilliumWeb-Regular.ttf +0 -0
  33. convoviz/assets/fonts/Yellowtail-Regular.ttf +0 -0
  34. convoviz/assets/fonts/YsabeauOffice-Regular.ttf +0 -0
  35. convoviz/assets/fonts/YsabeauSC-Regular.ttf +0 -0
  36. convoviz/assets/fonts/YsabeauSC-Thin.ttf +0 -0
  37. convoviz/assets/fonts/Zeyada-Regular.ttf +0 -0
  38. convoviz/assets/stopwords.txt +1 -0
  39. convoviz/cli.py +149 -0
  40. convoviz/config.py +120 -0
  41. convoviz/exceptions.py +47 -0
  42. convoviz/interactive.py +264 -0
  43. convoviz/io/__init__.py +21 -0
  44. convoviz/io/assets.py +109 -0
  45. convoviz/io/loaders.py +191 -0
  46. convoviz/io/writers.py +231 -0
  47. convoviz/logging_config.py +69 -0
  48. convoviz/models/__init__.py +24 -0
  49. convoviz/models/collection.py +115 -0
  50. convoviz/models/conversation.py +158 -0
  51. convoviz/models/message.py +218 -0
  52. convoviz/models/node.py +66 -0
  53. convoviz/pipeline.py +184 -0
  54. convoviz/py.typed +0 -0
  55. convoviz/renderers/__init__.py +10 -0
  56. convoviz/renderers/markdown.py +269 -0
  57. convoviz/renderers/yaml.py +119 -0
  58. convoviz/utils.py +155 -0
  59. convoviz-0.4.1.dist-info/METADATA +215 -0
  60. convoviz-0.4.1.dist-info/RECORD +62 -0
  61. convoviz-0.4.1.dist-info/WHEEL +4 -0
  62. convoviz-0.4.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,204 @@
1
+ """Word cloud generation for conversation text."""
2
+
3
+ import logging
4
+ import os
5
+ from concurrent.futures import ProcessPoolExecutor
6
+ from functools import lru_cache
7
+ from pathlib import Path
8
+
9
+ from nltk import download as nltk_download
10
+ from nltk.corpus import stopwords as nltk_stopwords
11
+ from nltk.data import find as nltk_find
12
+ from PIL.Image import Image
13
+ from tqdm import tqdm
14
+ from wordcloud import WordCloud
15
+
16
+ from convoviz.config import WordCloudConfig
17
+ from convoviz.models import ConversationCollection
18
+
19
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

# NLTK stopword corpus file ids (one per language) that get merged into a
# single multilingual stopword set.
STOPWORD_LANGUAGES = ["arabic", "english", "french", "german", "spanish", "portuguese"]
30
+
31
+
32
@lru_cache(maxsize=1)
def load_programming_stopwords() -> frozenset[str]:
    """Read the bundled list of programming stop words.

    The list is looked up at ``assets/stopwords.txt`` relative to the directory
    that contains this module's own directory. Blank lines and lines whose first
    non-whitespace character is ``#`` are skipped; every remaining line is
    stripped and lower-cased. The result is cached after the first call.

    Returns:
        Frozen set of programming stop words, or an empty frozen set when the
        asset file does not exist.
    """
    asset_file = Path(__file__).parent.parent / "assets" / "stopwords.txt"
    if not asset_file.exists():
        return frozenset()

    entries: set[str] = set()
    with open(asset_file, encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry and not entry.startswith("#"):
                entries.add(entry.lower())
    return frozenset(entries)
47
+
48
+
49
@lru_cache(maxsize=1)
def load_nltk_stopwords() -> frozenset[str]:
    """Return the cached union of NLTK stopwords for ``STOPWORD_LANGUAGES``.

    If the ``corpora/stopwords`` resource cannot be found locally, it is
    downloaded (quietly) before the word lists are read.

    Returns:
        Frozen set of stopwords from multiple languages
    """
    try:
        nltk_find("corpora/stopwords")
    except LookupError:
        # Corpus is not available locally yet: fetch it without console noise.
        nltk_download("stopwords", quiet=True)

    return frozenset(
        word for language in STOPWORD_LANGUAGES for word in nltk_stopwords.words(fileids=language)
    )
68
+
69
+
70
+ def parse_custom_stopwords(stopwords_str: str | None) -> set[str]:
71
+ """Parse a comma-separated string of custom stopwords.
72
+
73
+ Args:
74
+ stopwords_str: Comma-separated stopwords
75
+
76
+ Returns:
77
+ Set of lowercase, stripped stopwords
78
+ """
79
+ if not stopwords_str:
80
+ return set()
81
+
82
+ return {word.strip().lower() for word in stopwords_str.split(",") if word.strip()}
83
+
84
+
85
def generate_wordcloud(text: str, config: WordCloudConfig) -> Image:
    """Render ``text`` as a word cloud image.

    The stopword set is the NLTK stopwords plus the custom stopwords from
    ``config``, and additionally the programming stopwords when
    ``config.exclude_programming_keywords`` is set.

    Args:
        text: The text to create a word cloud from
        config: Word cloud configuration

    Returns:
        PIL Image of the word cloud
    """
    excluded: set[str] = {
        *load_nltk_stopwords(),
        *parse_custom_stopwords(config.custom_stopwords),
    }
    if config.exclude_programming_keywords:
        excluded |= load_programming_stopwords()

    # WordCloud receives the font as a string path, or None when no font is configured.
    font = None
    if config.font_path:
        font = str(config.font_path)

    cloud = WordCloud(
        font_path=font,
        width=config.width,
        height=config.height,
        stopwords=excluded,
        background_color=config.background_color,
        mode=config.mode,
        colormap=config.colormap,
        include_numbers=config.include_numbers,
    )
    cloud.generate(text)

    image: Image = cloud.to_image()
    return image
116
+
117
+
118
def _generate_and_save_wordcloud(args: tuple[str, str, Path, WordCloudConfig]) -> bool:
    """Render one word cloud and write it to disk (process-pool worker).

    Kept at module level so ProcessPoolExecutor can pickle it.

    Args:
        args: Tuple of (text, filename, output_dir, config)

    Returns:
        True if the image was generated and saved, False if the text was empty
        or whitespace-only and the task was skipped.
    """
    text, filename, output_dir, config = args
    if text.strip():
        target = output_dir / filename
        generate_wordcloud(text, config).save(target, optimize=True)
        return True
    return False
135
+
136
+
137
def generate_wordclouds(
    collection: ConversationCollection,
    output_dir: Path,
    config: WordCloudConfig,
    *,
    progress_bar: bool = False,
) -> None:
    """Generate word clouds for weekly, monthly, and yearly groupings.

    One PNG is written into ``output_dir`` per non-empty group. Rendering is
    distributed over a process pool to speed up generation on multi-core systems.

    Args:
        collection: Collection of conversations
        output_dir: Directory to save the word clouds (created if missing)
        config: Word cloud configuration; ``config.max_workers`` overrides the
            automatically chosen worker count when it is not ``None``
        progress_bar: Whether to show progress bars
    """
    import sys  # local import: only needed for the Windows worker cap below

    # ProcessPoolExecutor raises ValueError on Windows for more than 61 workers.
    windows_worker_limit = 61

    output_dir.mkdir(parents=True, exist_ok=True)
    logger.info("Generating wordclouds to %s", output_dir)

    # (groups, strftime pattern for the file stem), in output order.
    groupings = (
        # e.g. 2024-W15.png. NOTE: %W is the Monday-based week-of-year number
        # (00-53), which is NOT the ISO 8601 week (that would be %G-W%V).
        (collection.group_by_week(), "%Y-W%W"),
        # e.g. 2024-03-March.png (consistent with folder naming)
        (collection.group_by_month(), "%Y-%m-%B"),
        # e.g. 2024.png
        (collection.group_by_year(), "%Y"),
    )

    # Pre-load/download NLTK stopwords in the main process to avoid race conditions in workers
    load_nltk_stopwords()

    # Every task is a picklable tuple: (text, filename, output_dir, config)
    tasks: list[tuple[str, str, Path, WordCloudConfig]] = [
        (
            group.plaintext("user", "assistant"),
            f"{period.strftime(pattern)}.png",
            output_dir,
            config,
        )
        for groups, pattern in groupings
        for period, group in groups.items()
    ]

    if not tasks:
        return

    # Determine worker count: use config if set, otherwise half CPU count (min 1)
    max_workers = config.max_workers
    if max_workers is None:
        max_workers = max(1, (os.cpu_count() or 2) // 2)
        if sys.platform == "win32":
            # Without this cap, machines with >= 124 logical CPUs would crash here.
            max_workers = min(max_workers, windows_worker_limit)

    logger.debug(
        "Starting wordcloud generation with %s workers for %s tasks", max_workers, len(tasks)
    )
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map so all tasks finish (and worker exceptions
        # surface) before the pool is closed; tqdm only wraps it for progress display.
        list(
            tqdm(
                executor.map(_generate_and_save_wordcloud, tasks),
                total=len(tasks),
                desc="Creating wordclouds 🔡☁️",
                disable=not progress_bar,
            )
        )
@@ -0,0 +1,15 @@
1
+ viridis
2
+ plasma
3
+ inferno
4
+ magma
5
+ cividis
6
+ Blues
7
+ Greens
8
+ YlGnBu
9
+ YlOrRd
10
+ RdYlBu
11
+ Spectral
12
+ coolwarm
13
+ terrain
14
+ ocean
15
+ flag
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+
convoviz/cli.py ADDED
@@ -0,0 +1,149 @@
1
+ """Command-line interface for convoviz."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ import typer
7
+ from rich.console import Console
8
+ from rich.markup import escape
9
+
10
+ from convoviz.config import FolderOrganization, OutputKind, get_default_config
11
+ from convoviz.exceptions import ConfigurationError, InvalidZipError
12
+ from convoviz.interactive import run_interactive_config
13
+ from convoviz.io.loaders import find_latest_zip
14
+ from convoviz.logging_config import setup_logging
15
+ from convoviz.pipeline import run_pipeline
16
+ from convoviz.utils import default_font_path
17
+
18
# Typer application object; shell-completion options are turned off.
app = typer.Typer(
    help="ChatGPT Data Visualizer 📊 - Convert and visualize your ChatGPT history",
    add_completion=False,
)

# Rich console shared by this module for user-facing terminal output.
console = Console()
23
+
24
+
25
@app.callback(invoke_without_command=True)
def run(
    ctx: typer.Context,
    input_path: Path | None = typer.Option(
        None,
        "--input",
        "--zip",
        "-z",
        help="Path to the ChatGPT export zip file, JSON file, or extracted directory.",
        exists=True,
        file_okay=True,
        dir_okay=True,
    ),
    output_dir: Path | None = typer.Option(
        None,
        "--output",
        "-o",
        help="Path to the output directory.",
    ),
    outputs: list[OutputKind] | None = typer.Option(
        None,
        "--outputs",
        help="Output types to generate (repeatable). Options: markdown, graphs, wordclouds. "
        "If not specified, all outputs are generated.",
    ),
    flat: bool = typer.Option(
        False,
        "--flat",
        "-f",
        help="Put all markdown files in a single folder (disables date organization).",
    ),
    interactive: bool | None = typer.Option(
        None,
        "--interactive/--no-interactive",
        "-i/-I",
        help="Force interactive mode on or off.",
    ),
    verbose: int = typer.Option(
        0,
        "--verbose",
        "-v",
        help="Increase verbosity. Use -vv for debug.",
        count=True,
    ),
    log_file: Path | None = typer.Option(
        None,
        "--log-file",
        help="Path to log file. Defaults to a temporary file.",
    ),
) -> None:
    """Convert ChatGPT export data to markdown and generate visualizations."""
    # NOTE: the docstring above is kept to a single line on purpose; Typer shows
    # it as CLI help text, so implementation notes live in comments instead.
    #
    # Flow: set up logging -> build default config -> apply CLI overrides ->
    # optionally run the interactive prompts -> validate input -> run pipeline.
    # Every failure path exits with code 1; a user cancel (Ctrl-C during the
    # interactive prompts) exits with code 0.

    # Setup logging immediately
    log_path = setup_logging(verbose, log_file)
    logger = logging.getLogger("convoviz.cli")
    # Paths are arbitrary text: escape them so "[...]" segments are printed
    # literally instead of being parsed as Rich markup tags.
    console.print(f"[dim]Logging to: {escape(str(log_path))}[/dim]")
    logger.debug("Logging initialized. Output: %s", log_path)

    if ctx.invoked_subcommand is not None:
        return

    # Start with default config
    config = get_default_config()

    # Override with CLI args
    if input_path:
        config.input_path = input_path
    if output_dir:
        config.output_folder = output_dir
    if outputs:
        config.outputs = set(outputs)
    if flat:
        config.folder_organization = FolderOrganization.FLAT

    # Determine mode: interactive if explicitly requested or no input provided
    use_interactive = interactive if interactive is not None else (input_path is None)

    if use_interactive:
        console.print("Welcome to ChatGPT Data Visualizer ✨📊!\n")
        try:
            config = run_interactive_config(config)
        except KeyboardInterrupt:
            console.print("\n[yellow]Cancelled by user.[/yellow]")
            raise typer.Exit(code=0) from None
    else:
        # Non-interactive mode: validate we have what we need
        if not config.input_path:
            # Try to find a default
            latest = find_latest_zip()
            if latest:
                console.print(
                    f"No input specified, using latest zip found: {escape(str(latest))}"
                )
                config.input_path = latest
            else:
                console.print(
                    "[bold red]Error:[/bold red] No input file provided and none found in Downloads."
                )
                raise typer.Exit(code=1)

    # input_path is Optional on the config model; guard against it still being
    # unset so the check below cannot fail with AttributeError.
    if config.input_path is None:
        console.print("[bold red]Error:[/bold red] No input path provided.")
        raise typer.Exit(code=1)

    # Validate the input (basic check)
    if not config.input_path.exists():
        console.print(
            "[bold red]Error:[/bold red] Input path does not exist: "
            f"{escape(str(config.input_path))}"
        )
        raise typer.Exit(code=1)

    # Set default font if not set
    if not config.wordcloud.font_path:
        config.wordcloud.font_path = default_font_path()

    # Run the pipeline
    try:
        run_pipeline(config)
    except (InvalidZipError, ConfigurationError) as e:
        logger.error("Known error: %s", e)
        console.print(f"[bold red]Error:[/bold red] {escape(str(e))}")
        raise typer.Exit(code=1) from None
    except Exception as e:
        # Top-level boundary: record the traceback in the log, show a short message.
        logger.exception("Unexpected error occurred")
        console.print(f"[bold red]Unexpected error:[/bold red] {escape(str(e))}")
        console.print(f"[dim]See log file for details: {escape(str(log_path))}[/dim]")
        raise typer.Exit(code=1) from None
145
+
146
+
147
def main_entry() -> None:
    """Invoke the Typer ``app``; serves as the entry point for the CLI."""
    app()
convoviz/config.py ADDED
@@ -0,0 +1,120 @@
1
+ """Configuration models using Pydantic v2."""
2
+
3
+ from enum import Enum
4
+ from pathlib import Path
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class FolderOrganization(str, Enum):
    """Layout strategy for markdown output files.

    Members:
        FLAT: all files are placed in one directory.
        DATE: files are nested by year/month (default).
    """

    FLAT = "flat"
    DATE = "date"
15
+
16
+
17
class OutputKind(str, Enum):
    """Kinds of output that can be generated.

    Members:
        MARKDOWN: conversation markdown files.
        GRAPHS: usage analytics graphs.
        WORDCLOUDS: word cloud visualizations.
    """

    MARKDOWN = "markdown"
    GRAPHS = "graphs"
    WORDCLOUDS = "wordclouds"
23
+
24
+
25
# Immutable set holding every OutputKind member; the "generate everything" default.
ALL_OUTPUTS: frozenset[OutputKind] = frozenset(OutputKind.__members__.values())
27
+
28
+
29
class AuthorHeaders(BaseModel):
    """Headers for different message authors in markdown output."""

    # One markdown heading string per author role. The defaults give user and
    # assistant a level-1 heading and system/tool a level-3 heading.
    system: str = "### System"
    user: str = "# Me"
    assistant: str = "# ChatGPT"
    tool: str = "### Tool output"
36
+
37
+
38
class MarkdownConfig(BaseModel):
    """Configuration for markdown output."""

    # Which LaTeX delimiter style to emit; restricted to the two listed values.
    latex_delimiters: Literal["default", "dollars"] = "dollars"
    # Markdown dialect to target; restricted to the two listed values.
    flavor: Literal["standard", "obsidian"] = "standard"
43
+
44
+
45
class YAMLConfig(BaseModel):
    """Configuration for YAML frontmatter in markdown files."""

    # Each flag is an on/off switch named after a frontmatter field
    # (presumably True means the field is written; confirm in the YAML renderer).
    title: bool = True
    tags: bool = False
    chat_link: bool = True
    create_time: bool = True
    update_time: bool = True
    model: bool = True
    used_plugins: bool = False
    message_count: bool = True
    content_types: bool = False
    custom_instructions: bool = False
58
+
59
+
60
class ConversationConfig(BaseModel):
    """Configuration for conversation rendering."""

    # default_factory builds a fresh nested model for every instance, so
    # instances never share (and cannot mutate) each other's sub-config.
    markdown: MarkdownConfig = Field(default_factory=MarkdownConfig)
    yaml: YAMLConfig = Field(default_factory=YAMLConfig)
65
+
66
+
67
class MessageConfig(BaseModel):
    """Configuration for message rendering."""

    # A fresh AuthorHeaders instance is created per MessageConfig.
    author_headers: AuthorHeaders = Field(default_factory=AuthorHeaders)
71
+
72
+
73
class WordCloudConfig(BaseModel):
    """Configuration for word cloud generation."""

    # Optional font file path; None means no font has been chosen yet.
    font_path: Path | None = None
    # Name of the colormap used for the words.
    colormap: str = "RdYlBu"
    # Extra stopwords as one comma-separated string.
    custom_stopwords: str = "use, file, "
    # When True, the bundled programming stopword list is excluded as well.
    exclude_programming_keywords: bool = True
    # Background color; None leaves it unset.
    background_color: str | None = None
    # Image mode of the rendered cloud.
    mode: Literal["RGB", "RGBA"] = "RGBA"
    include_numbers: bool = False
    # Image dimensions (presumably pixels; confirm against the renderer).
    width: int = 600
    height: int = 600
    # Worker process count for parallel generation.
    max_workers: int | None = None  # None = use half CPU count
86
+
87
+
88
class GraphConfig(BaseModel):
    """Configuration for graph generation."""

    # Color as a hex string.
    color: str = "#4A90E2"
    grid: bool = True
    show_counts: bool = True
    # File name of the font used in graphs.
    font_name: str = "Montserrat-Regular.ttf"
    # Figure size as a (width, height) pair and resolution
    # (units presumably inches / dots-per-inch; confirm in the plotting code).
    figsize: tuple[int, int] = (10, 6)
    dpi: int = 300
    # Timezone selection; restricted to the two listed values.
    timezone: Literal["utc", "local"] = "local"
    # Optional per-month / per-year breakdowns, both off by default.
    generate_monthly_breakdowns: bool = False
    generate_yearly_breakdowns: bool = False
100
+
101
+
102
class ConvovizConfig(BaseModel):
    """Main configuration for convoviz."""

    # Input location; None until one has been provided.
    input_path: Path | None = None
    # Output root; defaults to ~/Documents/ChatGPT-Data, resolved per instance.
    output_folder: Path = Field(default_factory=lambda: Path.home() / "Documents" / "ChatGPT-Data")
    folder_organization: FolderOrganization = FolderOrganization.DATE
    # Mutable copy of ALL_OUTPUTS per instance, so callers may narrow it freely.
    outputs: set[OutputKind] = Field(default_factory=lambda: set(ALL_OUTPUTS))

    # Nested section configs, each freshly constructed per instance.
    message: MessageConfig = Field(default_factory=MessageConfig)
    conversation: ConversationConfig = Field(default_factory=ConversationConfig)
    wordcloud: WordCloudConfig = Field(default_factory=WordCloudConfig)
    graph: GraphConfig = Field(default_factory=GraphConfig)

    # Pydantic setting: run validation on default values too.
    model_config = {"validate_default": True}
115
+
116
+
117
# Factory for the default configuration: every call builds a new object.
def get_default_config() -> ConvovizConfig:
    """Build and return a new ``ConvovizConfig`` populated with its defaults."""
    default_config = ConvovizConfig()
    return default_config
convoviz/exceptions.py ADDED
@@ -0,0 +1,47 @@
1
+ """Custom exceptions for convoviz."""
2
+
3
+
4
+ class ConvovizError(Exception):
5
+ """Base exception for all convoviz errors."""
6
+
7
+
8
+ class InvalidZipError(ConvovizError):
9
+ """Raised when a ZIP file is invalid or missing conversations.json."""
10
+
11
+ def __init__(self, path: str, reason: str = "missing conversations.json") -> None:
12
+ self.path = path
13
+ self.reason = reason
14
+ super().__init__(f"Invalid ZIP file '{path}': {reason}")
15
+
16
+
17
+ class ConfigurationError(ConvovizError):
18
+ """Raised for configuration-related errors."""
19
+
20
+ def __init__(self, message: str, field: str | None = None) -> None:
21
+ self.field = field
22
+ super().__init__(message)
23
+
24
+
25
+ class RenderingError(ConvovizError):
26
+ """Raised when rendering fails."""
27
+
28
+ def __init__(self, message: str, conversation_id: str | None = None) -> None:
29
+ self.conversation_id = conversation_id
30
+ super().__init__(message)
31
+
32
+
33
+ class MessageContentError(ConvovizError):
34
+ """Raised when message content cannot be extracted."""
35
+
36
+ def __init__(self, message_id: str) -> None:
37
+ self.message_id = message_id
38
+ super().__init__(f"No valid content found in message: {message_id}")
39
+
40
+
41
+ class FileNotFoundError(ConvovizError):
42
+ """Raised when a required file is not found."""
43
+
44
+ def __init__(self, path: str, file_type: str = "file") -> None:
45
+ self.path = path
46
+ self.file_type = file_type
47
+ super().__init__(f"{file_type.capitalize()} not found: {path}")