wikigen 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
1
+ """
2
+ Enhanced help formatting for WikiGen CLI.
3
+ Provides colored, structured help output with icons and tree structure.
4
+ """
5
+
6
+ from ..metadata.project import HOMEPAGE_URL, CLI_ENTRY_POINT
7
+ from ..metadata.logo import print_logo
8
+
9
+
10
class HelpColors:
    """ANSI 256-color foreground escape sequences used by the help screen."""

    # Foreground shades, listed from brightest to dimmest.
    WHITE = "\033[38;5;255m"  # section headers and emphasized text
    LIGHT_GRAY = "\033[38;5;250m"  # tree connector glyphs
    MEDIUM_GRAY = "\033[38;5;245m"  # descriptions and labels
    DARK_GRAY = "\033[38;5;240m"  # low-emphasis details

    # Clears every active attribute.
    RESET = "\033[0m"
18
+
19
+
20
class HelpIcons:
    """Unicode glyphs shown next to help sections and entries."""

    # Section markers (several sections intentionally share a glyph).
    INFO = "◆"
    USAGE = "◎"
    SOURCE = "◎"
    OPTIONS = "⚙"
    SUBCOMMANDS = "⚙"
    EXAMPLES = "◆"
    MORE_INFO = "◆"

    # Inline symbols.
    ARROW = "→"
    CHECK = "✓"
32
+
33
+
34
class HelpTree:
    """Box-drawing fragments used to render help sections as a tree."""

    START = "┌─"  # opens a section
    MIDDLE = "├─"  # any entry that is followed by another entry
    END = "└─"  # final entry of a section
    VERTICAL = "│"  # continuation line under an entry
    SPACE = " "  # indentation filler
42
+
43
+
44
def print_enhanced_help():
    """Render the complete help screen: the logo followed by every help section."""
    print_logo()

    # Sections are emitted strictly in this order.
    section_printers = (
        _print_usage_section,
        _print_source_section,
        _print_options_section,
        _print_subcommands_section,
        _print_examples_section,
        _print_more_info_section,
    )
    for render_section in section_printers:
        render_section()
56
+
57
+
58
def _print_usage_section():
    """Print the USAGE header and the command synopsis, followed by a blank line."""
    rows = (
        f"{HelpColors.LIGHT_GRAY}┌─ {HelpColors.WHITE}{HelpIcons.USAGE} USAGE{HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}└─ {HelpColors.MEDIUM_GRAY}{CLI_ENTRY_POINT} [-h] run [url|path] [OPTIONS...]{HelpColors.RESET}",
    )
    for row in rows:
        print(row)
    print()
67
+
68
+
69
def _print_source_section():
    """Print the SOURCE section describing what ``run`` accepts, then a blank line."""
    rows = (
        # Section header.
        f"{HelpColors.LIGHT_GRAY}┌─ {HelpColors.WHITE}{HelpIcons.SOURCE} SOURCE{HelpColors.RESET}",
        # The command itself.
        f"{HelpColors.LIGHT_GRAY}├─ {HelpColors.MEDIUM_GRAY}{CLI_ENTRY_POINT} run [url|path]{HelpColors.DARK_GRAY} {HelpIcons.INFO} Generate documentation (auto-detects URL or path){HelpColors.RESET}",
        # The three accepted argument forms.
        f"{HelpColors.LIGHT_GRAY}│ {HelpColors.DARK_GRAY} {HelpIcons.INFO} url: GitHub repository URL (e.g., https://github.com/user/repo){HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}│ {HelpColors.DARK_GRAY} {HelpIcons.INFO} path: Local directory path (e.g., /path/to/project){HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}└─ {HelpColors.DARK_GRAY} {HelpIcons.INFO} (no argument): Current directory{HelpColors.RESET}",
    )
    for row in rows:
        print(row)
    print()
87
+
88
+
89
def _print_options_section():
    """Print the OPTIONS section: one tree row per flag, then a blank line."""
    print(
        f"{HelpColors.LIGHT_GRAY}┌─ {HelpColors.WHITE}{HelpIcons.OPTIONS} OPTIONS{HelpColors.RESET}"
    )

    # (flag synopsis, description) pairs, in display order.
    options = (
        ("-h, --help", "Show this help message and exit"),
        (
            "-n, --name NAME",
            "Project name (optional, derived from repo/directory if omitted)",
        ),
        (
            "-t, --token TOKEN",
            "GitHub personal access token (optional, reads from GITHUB_TOKEN env var)",
        ),
        ("-o, --output OUTPUT", "Base directory for output (default: from config)"),
        ("-i, --include PATTERN", "Include file patterns (e.g. '*.py' '*.js')"),
        ("-e, --exclude PATTERN", "Exclude file patterns (e.g. 'tests/*' 'docs/*')"),
        ("-s, --max-size SIZE", "Maximum file size in bytes (default: from config)"),
        ("--language LANG", "Language for the generated wiki (default: from config)"),
        ("--no-cache", "Disable LLM response caching (default: caching enabled)"),
        (
            "--max-abstractions N",
            "Maximum number of abstractions to identify (default: from config)",
        ),
        (
            "--mode MODE",
            "Documentation mode: minimal or comprehensive (default: from config)",
        ),
        ("--ci", "Enable CI mode (non-interactive, uses defaults)"),
        ("--update", "Update existing documentation instead of overwriting"),
        ("--output-path PATH", "Custom output path for documentation"),
        ("--check-changes", "Exit with code 1 if docs changed (for CI checks)"),
    )

    final_index = len(options) - 1
    for position, (option, description) in enumerate(options):
        # Only the last row closes the tree; every other row uses a tee.
        if position == final_index:
            connector = "└─"
        else:
            connector = "├─"
        prefix = f"{HelpColors.LIGHT_GRAY}{connector}{HelpColors.RESET}"
        # The flag column is left-aligned and padded to 25 characters.
        print(
            f"{prefix} {HelpColors.MEDIUM_GRAY}{option:<25}{HelpColors.DARK_GRAY} {HelpIcons.INFO} {description}{HelpColors.RESET}"
        )

    print()
145
+
146
+
147
def _print_subcommands_section():
    """Print the SUBCOMMANDS section (run, init, config), then a blank line."""
    rows = (
        f"{HelpColors.LIGHT_GRAY}┌─ {HelpColors.WHITE}{HelpIcons.SUBCOMMANDS} SUBCOMMANDS{HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}├─ {HelpColors.MEDIUM_GRAY}run [url|path]{HelpColors.DARK_GRAY} {HelpIcons.INFO} Generate documentation (auto-detects URL or path){HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}├─ {HelpColors.MEDIUM_GRAY}init{HelpColors.DARK_GRAY} {HelpIcons.INFO} Set up configuration{HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}└─ {HelpColors.MEDIUM_GRAY}config <command>{HelpColors.DARK_GRAY} {HelpIcons.INFO} Manage configuration{HelpColors.RESET}",
    )
    for row in rows:
        print(row)
    print()
162
+
163
+
164
def _print_examples_section():
    """Print the EXAMPLES section: one tree row per sample command, then a blank line."""
    print(
        f"{HelpColors.LIGHT_GRAY}┌─ {HelpColors.WHITE}{HelpIcons.EXAMPLES} EXAMPLES{HelpColors.RESET}"
    )

    # Sample invocations, in display order.
    examples = (
        f"{CLI_ENTRY_POINT} run {HelpIcons.INFO} Current directory",
        f"{CLI_ENTRY_POINT} run https://github.com/user/repo {HelpIcons.INFO} GitHub repo",
        f"{CLI_ENTRY_POINT} run /path/to/project {HelpIcons.INFO} Local directory",
        f"{CLI_ENTRY_POINT} init",
        f"{CLI_ENTRY_POINT} config show",
        f'{CLI_ENTRY_POINT} config update-gemini-key "your-key"',
    )

    final_index = len(examples) - 1
    for position, example in enumerate(examples):
        # The last row closes the tree; all earlier rows use a tee.
        if position == final_index:
            connector = "└─"
        else:
            connector = "├─"
        prefix = f"{HelpColors.LIGHT_GRAY}{connector}{HelpColors.RESET}"
        print(f"{prefix} {HelpColors.MEDIUM_GRAY}{example}{HelpColors.RESET}")

    print()
185
+
186
+
187
def _print_more_info_section():
    """Print the MORE INFO section pointing at the project homepage."""
    rows = (
        f"{HelpColors.LIGHT_GRAY}┌─ {HelpColors.WHITE}{HelpIcons.MORE_INFO} MORE INFO{HelpColors.RESET}",
        f"{HelpColors.LIGHT_GRAY}└─ {HelpColors.MEDIUM_GRAY}Visit: {HelpColors.WHITE}{HOMEPAGE_URL}{HelpColors.RESET}",
    )
    # Unlike the other sections, no trailing blank line is printed here.
    for row in rows:
        print(row)
@@ -0,0 +1,56 @@
1
+ """
2
+ Init mode formatter for WikiGen CLI.
3
+ Provides structured visual output for the configuration setup process.
4
+ """
5
+
6
+ from .output_formatter import Colors, Icons, Tree
7
+ from ..metadata.logo import print_logo
8
+
9
+
10
def print_init_header():
    """Print the logo, a spacer line, and the opening "Configuration Setup" tree row."""
    print_logo()
    print()  # spacer between the logo and the tree

    header_row = (
        f"{Colors.LIGHT_GRAY}{Tree.START} {Colors.WHITE}{Icons.CONFIG} "
        f"Configuration Setup{Colors.RESET}"
    )
    print(header_row)
18
+
19
+
20
def print_section_start(name, icon):
    """Print a tree row that introduces a configuration section.

    Args:
        name: Section title to display.
        icon: Glyph shown before the title.
    """
    section_row = (
        f"{Colors.LIGHT_GRAY}{Tree.MIDDLE} {Colors.WHITE}{icon} {name}{Colors.RESET}"
    )
    print(section_row)
23
+
24
+
25
def print_input_prompt(label, icon, is_required=True, default_value=None):
    """Print a two-line, tree-indented prompt and leave the cursor after the arrow.

    Args:
        label: Name of the value being requested.
        icon: Glyph shown before the label.
        is_required: Selects the "(required)" or "(optional, ...)" suffix.
        default_value: Shown in square brackets when truthy; omitted otherwise.
    """
    if is_required:
        required_text = " (required)"
    else:
        required_text = " (optional, press Enter to skip)"

    # Falsy defaults (None, "", 0) are not displayed.
    if default_value:
        default_text = f" [{default_value}]"
    else:
        default_text = ""

    label_row = (
        f"{Colors.LIGHT_GRAY}{Tree.VERTICAL} {Colors.LIGHT_GRAY}{Tree.MIDDLE} "
        f"{Colors.MEDIUM_GRAY}{icon} {label}{required_text}{default_text}{Colors.RESET}"
    )
    arrow_row = (
        f"{Colors.LIGHT_GRAY}{Tree.VERTICAL} {Colors.LIGHT_GRAY}{Tree.VERTICAL} "
        f"{Colors.MEDIUM_GRAY}→ {Colors.RESET}"
    )

    print(label_row)
    # No newline: whatever is typed or printed next continues on the arrow line.
    print(arrow_row, end="")
39
+
40
+
41
def print_init_complete(config_path, output_dir, keyring_available):
    """Print the closing tree row and a summary of the finished setup.

    Args:
        config_path: Location reported in the "Saved to" line.
        output_dir: Directory shown on the final folder line.
        keyring_available: Truthy selects the "Enabled" keyring status text,
            falsy selects the "Not available" text.
    """
    if keyring_available:
        keyring_status = "Enabled (secure storage)"
    else:
        keyring_status = "Not available (saved to config file)"

    closing_row = (
        f"{Colors.LIGHT_GRAY}{Tree.END} {Colors.WHITE}{Icons.SUCCESS} "
        f"Configuration Complete{Colors.RESET}"
    )
    print(closing_row)
    print()

    summary_rows = (
        f"{Colors.WHITE}{Icons.SUCCESS} Saved to {config_path}{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY}{Icons.INFO} Keyring: {keyring_status}{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY}📂 {Colors.WHITE}{output_dir}{Colors.RESET}",
    )
    for row in summary_rows:
        print(row)
@@ -0,0 +1,290 @@
1
+ """
2
+ Output formatting utilities for WikiGen CLI.
3
+ Provides tree-structured output with icons, colors, and timing.
4
+ """
5
+
6
+
7
+ # ANSI 256-color codes (work on both light and dark backgrounds)
8
class Colors:
    """ANSI 256-color foreground escape sequences for CLI progress output."""

    # Foreground shades, listed from brightest to dimmest.
    WHITE = "\033[38;5;255m"  # phase headers and success lines
    LIGHT_GRAY = "\033[38;5;250m"  # tree connector glyphs
    MEDIUM_GRAY = "\033[38;5;245m"  # operation text
    DARK_GRAY = "\033[38;5;240m"  # timings and file sizes

    # Clears every active attribute.
    RESET = "\033[0m"
14
+
15
+
16
+ # Unicode icons for different operations
17
class Icons:
    """Unicode glyphs that label the different kinds of CLI output lines."""

    # --- configuration ---
    CONFIG = "⚙"
    INFO = "◆"

    # --- repository operations ---
    CRAWLING = "◎"
    DOWNLOAD = "↓"
    SKIP = "○"

    # --- LLM operations ---
    PROCESSING = "⟳"
    ANALYZING = "◉"
    ORDERING = "◈"

    # --- content generation ---
    WRITING = "✎"
    GENERATING = "◊"

    # --- file operations ---
    CREATING = "▸"

    # --- status ---
    SUCCESS = "✓"
    ERROR = "✗"
    WARNING = "⚠"
43
+
44
+
45
+ # Tree structure characters
46
class Tree:
    """Box-drawing fragments used to render progress output as a tree."""

    START = "┌─"  # opens a section
    MIDDLE = "├─"  # any entry that is followed by another entry
    END = "└─"  # final entry of a section
    VERTICAL = "│"  # continuation line under an entry
    SPACE = " "  # indentation filler
52
+
53
+
54
class PhaseTracker:
    """Bookkeeping for the phase currently being printed.

    Attributes are plain counters/flags:
    ``depth`` (int), ``in_phase`` (bool) and ``phase_items`` (int).
    """

    def __init__(self):
        # A fresh tracker is outside any phase with nothing counted.
        self.in_phase = False
        self.phase_items = 0
        self.depth = 0

    def start_phase(self):
        """Enter a phase, resetting the item counter and depth."""
        self.in_phase, self.phase_items, self.depth = True, 0, 0

    def end_phase(self):
        """Leave the current phase; the item counter is left untouched."""
        self.in_phase, self.depth = False, 0

    def add_item(self):
        """Count one more item in the current phase."""
        self.phase_items += 1
76
+
77
+
78
# Module-wide tracker instance shared by the print_* helpers below.
_tracker = PhaseTracker()
80
+
81
+
82
def format_time(seconds):
    """Return *seconds* rendered with one decimal place as ``[X.Xs]``."""
    one_decimal = f"{seconds:.1f}"
    return f"[{one_decimal}s]"
85
+
86
+
87
def format_size(bytes_size):
    """Return a byte count as ``N bytes``, ``X.X KB`` or ``X.X MB``.

    Values below 1024 are shown verbatim in bytes; values below 1024 * 1024
    are shown in KB; everything else is shown in MB (one decimal place).
    """
    kib = 1024
    mib = kib * kib

    # Guard clauses, smallest unit first.
    if bytes_size < kib:
        return f"{bytes_size} bytes"
    if bytes_size < mib:
        return f"{bytes_size / kib:.1f} KB"
    return f"{bytes_size / mib:.1f} MB"
95
+
96
+
97
def print_header(version=None):
    """Print the "WikiGen vX" banner line.

    Args:
        version: Version string to show. When None, the package's
            ``__version__`` is imported lazily and used instead.
    """
    if version is None:
        # Deferred import: only needed when the caller did not supply a version.
        from ..metadata import __version__ as version

    banner = f"{Colors.WHITE}WikiGen {Colors.LIGHT_GRAY}v{version}{Colors.RESET}"
    print(banner)
105
+
106
+
107
def print_info(label, value):
    """Print one "<icon> label: value" configuration line.

    Args:
        label: Name of the setting.
        value: Value to display after the colon.
    """
    info_row = (
        f"{Colors.MEDIUM_GRAY}{Icons.INFO} {label}: {Colors.WHITE}{value}{Colors.RESET}"
    )
    print(info_row)
112
+
113
+
114
def print_phase_start(name, icon):
    """Open a new top-level phase and print its header row.

    Resets the module tracker, prints a blank spacer line, then a row such as
    "┌─ ◎ Repository Crawling".

    Args:
        name: Phase title.
        icon: Glyph shown before the title.
    """
    _tracker.start_phase()

    header_row = (
        f"{Colors.LIGHT_GRAY}{Tree.START} {Colors.WHITE}{icon} {name}{Colors.RESET}"
    )
    print()  # spacer before the phase
    print(header_row)
122
+
123
+
124
def print_operation(text, icon=None, indent=1, is_last=False, elapsed_time=None):
    """Print one operation row inside the current phase.

    The row is registered with the module tracker, prefixed with tree
    connectors for its indentation level, and optionally suffixed with the
    elapsed time.

    Args:
        text: Operation description.
        icon: Glyph shown before the text; omitted when falsy.
        indent: Indentation level (1 for a direct child, 2 for nested, ...).
            Values below 1 produce no tree prefix.
        is_last: When true the row is drawn with the closing connector.
        elapsed_time: Seconds to display inline as ``[X.Xs]``; nothing is
            shown when None.
    """
    _tracker.add_item()

    # Ancestor levels draw a vertical guide; the row's own level draws the branch.
    if indent > 0:
        branch = Tree.END if is_last else Tree.MIDDLE
        guides = (Colors.LIGHT_GRAY + Tree.VERTICAL + " ") * (indent - 1)
        prefix = guides + Colors.LIGHT_GRAY + branch + " "
    else:
        prefix = ""

    if icon:
        formatted_text = f"{Colors.MEDIUM_GRAY}{icon} {text}{Colors.RESET}"
    else:
        formatted_text = f"{Colors.MEDIUM_GRAY}{text}{Colors.RESET}"

    if elapsed_time is not None:
        # format_time() already returns "[X.Xs]"; wrapping it in another pair
        # of brackets printed "[[X.Xs]]", unlike print_success().
        formatted_text += (
            f" {Colors.DARK_GRAY}{format_time(elapsed_time)}{Colors.RESET}"
        )

    print(f"{prefix}{formatted_text}")
162
+
163
+
164
def print_success(text, elapsed_time=None, indent=1):
    """Print a closing success row, e.g. "└─ ✓ Complete (43 files, 85.2 KB) [2.3s]".

    Args:
        text: Message shown after the success glyph.
        elapsed_time: Seconds to append via ``format_time``; omitted when None.
        indent: Indentation level; values below 1 produce no tree prefix.
    """
    if elapsed_time is None:
        time_suffix = ""
    else:
        time_suffix = f" {Colors.DARK_GRAY}{format_time(elapsed_time)}{Colors.RESET}"

    # Every level except the innermost draws a vertical guide; the innermost
    # always draws the closing connector.
    segments = [
        Colors.LIGHT_GRAY + (Tree.VERTICAL if level < indent - 1 else Tree.END) + " "
        for level in range(indent)
    ]
    prefix = "".join(segments)

    print(f"{prefix}{Colors.WHITE}{Icons.SUCCESS} {text}{time_suffix}{Colors.RESET}")
185
+
186
+
187
def print_phase_end():
    """Print a lone vertical connector row, then mark the phase as finished."""
    connector_row = f"{Colors.LIGHT_GRAY}{Tree.VERTICAL}{Colors.RESET}"
    print(connector_row)
    _tracker.end_phase()
191
+
192
+
193
def print_final_success(message, total_time, output_path):
    """Print the final success summary and the output location.

    Example output:
        ✓ Success! Documents generated [66.2s] total
        📂 /Users/.../output/

    Args:
        message: Text shown after the success glyph.
        total_time: Total elapsed seconds, rendered via ``format_time``.
        output_path: Location shown on the folder line.
    """
    summary_rows = (
        f"{Colors.WHITE}{Icons.SUCCESS} {message} {Colors.DARK_GRAY}{format_time(total_time)} total{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY}📂 {Colors.WHITE}{output_path}{Colors.RESET}",
    )
    print()  # spacer before the summary
    for row in summary_rows:
        print(row)
205
+
206
+
207
def print_error_missing_api_key(provider_display: str = "API"):
    """Print the "API key not found" error with instructions for fixing it.

    Args:
        provider_display: Provider name placed before "API key" in the
            headline (for example "Gemini" gives "Gemini API key not found").
            With the default "API" (or an empty value) the headline is simply
            "API key not found".
    """
    from ..metadata import CLI_ENTRY_POINT

    # The default label is itself "API"; prefixing it again printed the
    # stuttering headline "API API key not found".
    normalized = str(provider_display).strip().upper() if provider_display else ""
    if normalized in ("", "API"):
        key_label = "API key"
    else:
        key_label = f"{provider_display} API key"

    print()
    print(f"{Colors.WHITE}{Icons.ERROR} Error: {key_label} not found{Colors.RESET}")
    print(f"{Colors.MEDIUM_GRAY} To configure your API key, run:{Colors.RESET}")
    print(
        f"{Colors.WHITE} {CLI_ENTRY_POINT} config update-api-key <provider>{Colors.RESET}"
    )
    print(
        f"{Colors.MEDIUM_GRAY} Or set the appropriate API key environment variable{Colors.RESET}"
    )
222
+
223
+
224
def print_error_invalid_api_key():
    """Print the "invalid or unauthorized API key" error and the update command."""
    from ..metadata import CLI_ENTRY_POINT

    rows = (
        f"{Colors.WHITE}{Icons.ERROR} Error: Invalid or unauthorized API key{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY} Your API key may be invalid or expired.{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY} To update your API key, run:{Colors.RESET}",
        f"{Colors.WHITE} {CLI_ENTRY_POINT} config update-api-key <provider>{Colors.RESET}",
    )
    print()  # spacer before the error block
    for row in rows:
        print(row)
239
+
240
+
241
def print_error_rate_limit():
    """Print the "rate limit exceeded" error with advice on reducing API calls."""
    print()
    print(f"{Colors.WHITE}{Icons.ERROR} Error: Rate limit exceeded{Colors.RESET}")
    print(
        f"{Colors.MEDIUM_GRAY} You've hit the API rate limit. Please wait and try again.{Colors.RESET}"
    )
    # --no-cache disables LLM response caching, so recommending it would
    # increase API traffic; the advice is to keep caching enabled.
    print(
        f"{Colors.MEDIUM_GRAY} Avoid the --no-cache flag so cached responses can reduce API calls.{Colors.RESET}"
    )
251
+
252
+
253
def print_error_network():
    """Print the "network connection issue" error and a connectivity hint."""
    rows = (
        f"{Colors.WHITE}{Icons.ERROR} Error: Network connection issue{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY} Unable to connect to the API. Please check your internet connection.{Colors.RESET}",
    )
    print()  # spacer before the error block
    for row in rows:
        print(row)
260
+
261
+
262
def print_error_general(error):
    """Print a generic "unexpected error" block that includes ``str(error)``.

    Args:
        error: The exception (or any object) whose string form is shown.
    """
    rows = (
        f"{Colors.WHITE}{Icons.ERROR} Error: An unexpected error occurred{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY} {str(error)}{Colors.RESET}",
        f"{Colors.MEDIUM_GRAY} Please check your configuration and try again.{Colors.RESET}",
    )
    print()  # spacer before the error block
    for row in rows:
        print(row)
272
+
273
+
274
def print_update_notification(current_version: str, latest_version: str):
    """Print an "update available" notice followed by the pip upgrade command.

    Args:
        current_version: Version shown on the left of the arrow.
        latest_version: Version shown on the right of the arrow.
    """
    notice_row = (
        f"{Colors.WHITE}{Icons.INFO} Update available: "
        f"{Colors.MEDIUM_GRAY}v{current_version}"
        f"{Colors.WHITE} → {Colors.WHITE}v{latest_version}{Colors.RESET}"
    )
    upgrade_row = f"{Colors.MEDIUM_GRAY} To upgrade, run: {Colors.WHITE}pip install --upgrade wikigen{Colors.RESET}"

    print()  # spacer before the notice
    for row in (notice_row, upgrade_row):
        print(row)
@@ -0,0 +1,12 @@
1
+ """MCP (Model Context Protocol) server for wikigen."""
2
+
3
+
4
+ # Lazy import to avoid requiring mcp package at module load time
5
def run_mcp_server():
    """Start the MCP server by delegating to ``.server.run_mcp_server``."""
    # Imported on first call so that importing this package does not pull in
    # the server module.
    from . import server as _server_module

    # Per the original note, the delegated call runs the server rather than
    # returning a result.
    _server_module.run_mcp_server()
10
+
11
+
12
# Names exported by ``from <package> import *``.
__all__ = ["run_mcp_server"]
@@ -0,0 +1,127 @@
1
+ """Markdown chunking utilities for semantic search.
2
+
3
+ This module provides intelligent chunking of markdown documents that respects
4
+ markdown structure (headers, code blocks) while maintaining configurable
5
+ chunk sizes and overlaps.
6
+ """
7
+
8
+ import re
9
+ from typing import List, Dict, Any
10
+
11
+
12
# Patterns are compiled once; all of them are searched with explicit
# (pos, endpos) bounds instead of slicing the document.
_CODE_BLOCK_RE = re.compile(r"```[\s\S]*?```")
_HEADER_RE = re.compile(r"\n#{1,6}\s+")
_PARAGRAPH_RE = re.compile(r"\n\n+")
_SENTENCE_RE = re.compile(r"[.!?]\s+")
_WHITESPACE_RE = re.compile(r"\s+")


def _find_break_point(content: str, floor: int, end_pos: int) -> int:
    """Return a break position near *end_pos* that is never before *floor*."""
    limit = len(content)

    # Preferred: cut just before the first markdown header in the window.
    header = _HEADER_RE.search(content, floor, min(end_pos + 100, limit))
    if header:
        return header.start()

    # Fallbacks, coarsest first: paragraph gap, sentence end, any whitespace.
    # Each entry is (pattern, chars to look back, chars to look ahead).
    for pattern, look_back, look_ahead in (
        (_PARAGRAPH_RE, 200, 100),
        (_SENTENCE_RE, 100, 50),
        (_WHITESPACE_RE, 50, 50),
    ):
        window_start = max(end_pos - look_back, floor)
        found = pattern.search(content, window_start, min(end_pos + look_ahead, limit))
        if found:
            return found.end()

    return end_pos


def _snap_past_code_block(position: int, code_spans) -> int:
    """Return the end of the fenced block strictly containing *position*, else *position*."""
    for block_start, block_end in code_spans:
        if block_start < position < block_end:
            return block_end
    return position


def chunk_markdown(
    content: str, chunk_size: int = 500, overlap: int = 50, min_chunk_chars: int = 100
) -> List[Dict[str, Any]]:
    """
    Chunk markdown content while respecting its structure.

    Chunk boundaries are chosen, in order of preference, at a markdown header,
    a paragraph gap, a sentence end, or whitespace. A boundary (or the start of
    the next, overlapping chunk) that would fall inside a closed fenced code
    block is moved to the end of that block so the block is never split.

    Every chunk advances by at least half the target size, and the next chunk
    never starts after the previous one ended, so no text is skipped between
    chunks.

    Args:
        content: The markdown content to chunk.
        chunk_size: Target chunk size in tokens (approximated as 4 characters
            per token). Must be positive.
        overlap: Number of tokens to overlap between consecutive chunks.
            Negative values are treated as 0.
        min_chunk_chars: Chunks whose stripped text is shorter than this many
            characters are not emitted.

    Returns:
        List of dictionaries with chunk information:
        - 'content': The chunk text (stripped of surrounding whitespace)
        - 'start_pos': Starting position in the original content
        - 'end_pos': Ending position in the original content
        - 'chunk_index': Index of this chunk among the emitted chunks (0-based)

    Raises:
        ValueError: If chunk_size is not positive.
    """
    if not content:
        return []
    if chunk_size <= 0:
        # A non-positive size gives no forward progress and would never terminate.
        raise ValueError("chunk_size must be a positive number of tokens")

    # Approximate tokens: roughly 4 characters per token.
    char_size = chunk_size * 4
    char_overlap = max(overlap, 0) * 4
    min_progress = max(char_size // 2, 1)

    total = len(content)
    code_spans = [match.span() for match in _CODE_BLOCK_RE.finditer(content)]

    chunks: List[Dict[str, Any]] = []
    current_pos = 0

    while current_pos < total:
        end_pos = min(current_pos + char_size, total)

        # Not at the end of the document: look for a natural break point that
        # is at least min_progress characters into the chunk.
        if end_pos < total:
            end_pos = _find_break_point(content, current_pos + min_progress, end_pos)

        # Never end a chunk in the middle of a fenced code block.
        end_pos = _snap_past_code_block(end_pos, code_spans)

        chunk_content = content[current_pos:end_pos].strip()
        if chunk_content and len(chunk_content) >= min_chunk_chars:
            chunks.append(
                {
                    "content": chunk_content,
                    "start_pos": current_pos,
                    "end_pos": end_pos,
                    "chunk_index": len(chunks),
                }
            )

        if end_pos >= total:
            break

        # Step back by the overlap, but always advance by at least min_progress.
        next_start = max(end_pos - char_overlap, current_pos + min_progress)
        # Do not begin the next chunk inside a fenced code block.
        next_start = _snap_past_code_block(next_start, code_spans)
        # Never start beyond the text already covered, so nothing is skipped.
        current_pos = min(next_start, end_pos)

    return chunks