data2prompt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
data2prompt/cli.py ADDED
@@ -0,0 +1,146 @@
1
+ import argparse
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Set
4
+ from pathlib import Path
5
+ from argparse import Namespace
6
+
7
+ from .constants import (
8
+ CORE_IGNORES,
9
+ CORE_IGNORE_FILES,
10
+ CORE_SKIP_EXTS,
11
+ DEFAULT_CSV_SAMPLE_SIZE,
12
+ DEFAULT_SQL_SAMPLE_SIZE,
13
+ DEFAULT_SQL_MAX_LINES,
14
+ DEFAULT_MAX_LINES,
15
+ DEFAULT_MAX_SHEETS,
16
+ DEFAULT_SEED,
17
+ DEFAULT_LINE_LENGTH_THRESHOLD,
18
+ DEFAULT_TRUNCATED_LINE_LENGTH,
19
+ DEFAULT_TABLE_CHAR_LIMIT,
20
+ DEFAULT_TABLE_TRUNCATED_SIZE,
21
+ DEFAULT_MAX_FILE_SIZE_KB,
22
+ DEFAULT_OUTPUT_FILE,
23
+ DEFAULT_FORMAT,
24
+ SUPPORTED_FORMATS
25
+ )
26
+
27
@dataclass
class Config:
    """Data Transfer Object for application configuration.

    Instances are built by ``setup_cli`` from the parsed command-line
    arguments. The first twelve fields are required; the remaining ones
    have defaults.
    """
    # Final output file name (base name plus the format-specific extension).
    output: str
    # Output format key, one of the keys of SUPPORTED_FORMATS.
    format: str
    # Number of random rows to sample from CSVs.
    csv_sample_size: int
    # Random seed for consistent CSV sampling.
    seed: int
    # Number of INSERT statements to keep in SQL files.
    sql_sample_size: int
    # Max non-data lines to keep in SQL files.
    sql_max_lines: int
    # Max lines of text output to keep per notebook cell.
    max_lines: int
    # Max number of sheets to process in Excel files.
    max_sheets: int
    # Max characters per line before truncation.
    line_length_threshold: int
    # Length to truncate long lines to.
    truncated_line_length: int
    # Max characters for a single table/sheet after sampling.
    table_limit: int
    # Length to truncate large tables to.
    table_truncate: int
    # Folder names to skip entirely (user-supplied names merged with CORE_IGNORES).
    ignore_folders: Set[str] = field(default_factory=set)
    # File names to skip entirely (user-supplied names merged with CORE_IGNORE_FILES).
    ignore_files: Set[str] = field(default_factory=set)
    # Max file size in KB to read entirely.
    max_file_size: int = 0
    # File extensions whose content is skipped (user-supplied merged with CORE_SKIP_EXTS).
    skip_exts: Set[str] = field(default_factory=set)
    # False when --no-gitignore was passed.
    use_gitignore: bool = True
47
+
48
def _normalize_extension(raw_ext: str) -> str:
    """Return *raw_ext* lower-cased and with a leading dot (``CSV`` -> ``.csv``)."""
    cleaned = raw_ext.strip().lower()
    return cleaned if cleaned.startswith('.') else f'.{cleaned}'


def setup_cli(argv=None) -> Config:
    """Configures the Command Line Interface (CLI) for the tool.

    Defines all available flags and their help descriptions, parses the
    arguments and merges the user's values with the CORE constants.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            behaviour callers relied on before this parameter existed.

    Returns:
        Config: A type-safe configuration object.
    """
    parser = argparse.ArgumentParser(
        description="📊 Data2Prompt: High-tech prompt packaging for Data Scientists."
    )

    # Output settings
    parser.add_argument('-o', '--output', default=DEFAULT_OUTPUT_FILE,
                        help=f'Base name of the generated file (default: {DEFAULT_OUTPUT_FILE})')

    parser.add_argument('-f', '--format', choices=list(SUPPORTED_FORMATS.keys()), default=DEFAULT_FORMAT,
                        help=f'Output format: xml or markdown (default: {DEFAULT_FORMAT})')

    # CSV sampling settings
    parser.add_argument('-s', '--csv-sample-size', type=int, default=DEFAULT_CSV_SAMPLE_SIZE,
                        help=f'Number of random rows to sample from CSVs (default: {DEFAULT_CSV_SAMPLE_SIZE})')
    parser.add_argument('--seed', type=int, default=DEFAULT_SEED,
                        help=f'Random seed for consistent CSV sampling (default: {DEFAULT_SEED})')

    # SQL sampling settings
    parser.add_argument('--sql-sample-size', type=int, default=DEFAULT_SQL_SAMPLE_SIZE,
                        help=f'Number of INSERT statements to keep in SQL files (default: {DEFAULT_SQL_SAMPLE_SIZE})')

    parser.add_argument('--sql-max-lines', type=int, default=DEFAULT_SQL_MAX_LINES,
                        help=f'Max non-data lines to keep in SQL files (default: {DEFAULT_SQL_MAX_LINES})')

    # Notebook settings
    parser.add_argument('--max-lines', type=int, default=DEFAULT_MAX_LINES,
                        help=f'Max lines of text output to keep per notebook cell (default: {DEFAULT_MAX_LINES})')

    # Excel settings
    parser.add_argument('--max-sheets', type=int, default=DEFAULT_MAX_SHEETS,
                        help=f'Max number of sheets to process in Excel files (default: {DEFAULT_MAX_SHEETS})')

    # Line Truncation settings
    parser.add_argument('--line-length-threshold', type=int, default=DEFAULT_LINE_LENGTH_THRESHOLD,
                        help=f'Max characters per line before truncation (default: {DEFAULT_LINE_LENGTH_THRESHOLD})')
    parser.add_argument('--truncated-line-length', type=int, default=DEFAULT_TRUNCATED_LINE_LENGTH,
                        help=f'Length to truncate long lines to (default: {DEFAULT_TRUNCATED_LINE_LENGTH})')

    # Table Truncation settings
    parser.add_argument('--table-limit', type=int, default=DEFAULT_TABLE_CHAR_LIMIT,
                        help=f'Max characters for a single table/sheet after sampling (default: {DEFAULT_TABLE_CHAR_LIMIT})')
    parser.add_argument('--table-truncate', type=int, default=DEFAULT_TABLE_TRUNCATED_SIZE,
                        help=f'Length to truncate large tables to (default: {DEFAULT_TABLE_TRUNCATED_SIZE})')

    # Exclusions
    parser.add_argument('--ignore-folders', nargs='+', default=[],
                        help='Additional folders to skip entirely')

    parser.add_argument('--ignore-files', nargs='+', default=[],
                        help='Additional files to skip entirely')

    parser.add_argument('--max-file-size', type=int, default=DEFAULT_MAX_FILE_SIZE_KB,
                        help=f'Max file size in KB to read entirely (default: {DEFAULT_MAX_FILE_SIZE_KB}KB)')

    # File formats to ignore
    parser.add_argument('--skip-exts', nargs='+', default=[],
                        help='Additional file extensions to skip content for')

    parser.add_argument('--no-gitignore', action='store_false', dest='use_gitignore',
                        help='Disable automatic .gitignore detection and filtering')

    args = parser.parse_args(argv)

    # --- Argument Merging Logic ---
    # We combine the user's terminal input with our CORE constants.
    # This ensures that even if a user provides custom ignores, essential items
    # like '.git' or binary extensions are still respected.

    # Combine base name with format-specific extension. If the user already
    # typed the extension (e.g. "-o PROMPT.md"), do not append it a second time.
    extension = SUPPORTED_FORMATS.get(args.format, SUPPORTED_FORMATS.get(DEFAULT_FORMAT))
    if extension and args.output.lower().endswith(extension):
        final_output_name = args.output
    else:
        final_output_name = f"{args.output}{extension}"

    # File suffixes are lower-cased before being looked up in skip_exts, so
    # user-supplied values such as "CSV" or "csv" would otherwise never match.
    user_skip_exts = {
        _normalize_extension(raw_ext) for raw_ext in args.skip_exts if raw_ext.strip()
    }

    return Config(
        output=final_output_name,
        format=args.format,
        csv_sample_size=args.csv_sample_size,
        seed=args.seed,
        sql_sample_size=args.sql_sample_size,
        sql_max_lines=args.sql_max_lines,
        max_lines=args.max_lines,
        max_sheets=args.max_sheets,
        line_length_threshold=args.line_length_threshold,
        truncated_line_length=args.truncated_line_length,
        table_limit=args.table_limit,
        table_truncate=args.table_truncate,
        ignore_folders=set(args.ignore_folders) | CORE_IGNORES,
        ignore_files=set(args.ignore_files) | CORE_IGNORE_FILES,
        max_file_size=args.max_file_size,
        skip_exts=user_skip_exts | CORE_SKIP_EXTS,
        use_gitignore=args.use_gitignore
    )
@@ -0,0 +1,94 @@
1
+ # --- Core Defaults & Constants ---
2
+
3
+ # Folders matching these names are excluded from both the project tree and content processing.
4
+ CORE_IGNORES = {
5
+ '.git', '__pycache__', 'venv', '.vscode', '.ipynb_checkpoints',
6
+ 'node_modules', '.idea', 'dist', 'build', '.mypy_cache',
7
+ '.pytest_cache', 'target', '.docker', '.aws', '.gcloud',
8
+ '__MACOSX'
9
+ }
10
+
11
+ # Specific filenames that should be excluded from the entire process.
12
+ CORE_IGNORE_FILES = set()
13
+
14
+ # Files with these extensions will have their names listed in the project tree,
15
+ # but their actual content will be skipped.
16
+ CORE_SKIP_EXTS = {
17
+ # Data & Databases
18
+ '.pbix', '.db', '.sqlite', '.sqlite3', '.parquet', '.pkl', '.pickle', '.feather', '.h5',
19
+ # Compressed & Binary
20
+ '.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.bin',
21
+ # Media
22
+ '.png', '.jpg', '.jpeg', '.gif', '.svg', '.pdf', '.mp4', '.mp3', '.mov',
23
+ # Environment & Secrets
24
+ '.env', '.venv', '.pyc', '.ds_store'
25
+ }
26
+
27
+ # Default values for CLI arguments and processing functions
28
+ DEFAULT_CSV_SAMPLE_SIZE = 15 # Controls the number of rows per csv file.
29
+ DEFAULT_SQL_SAMPLE_SIZE = 15 # Controls the number of INSERT/data rows kept per table in SQL files.
30
+ DEFAULT_SQL_MAX_LINES = 50 # Caps the total number of non-data lines (comments, setup, etc.) in SQL files.
31
+ DEFAULT_MAX_LINES = 40 # Max lines of text output to keep per notebook cell.
32
+ DEFAULT_MAX_SHEETS = 10 # Max number of sheets to process in Excel files.
33
+ DEFAULT_SEED = 42 # Random seed for consistent sampling.
34
+ DEFAULT_LINE_LENGTH_THRESHOLD = 4000 # Max characters allowed per line before truncation is triggered.
35
+ DEFAULT_TRUNCATED_LINE_LENGTH = 1000 # Number of characters to keep when a line is truncated.
36
+ DEFAULT_TABLE_CHAR_LIMIT = 50000 # Max characters allowed for a single table/sheet representation after sampling.
37
+ DEFAULT_TABLE_TRUNCATED_SIZE = 20000 # Number of characters to keep when a table/sheet is truncated due to size.
38
+ DEFAULT_MAX_FILE_SIZE_KB = 70 # Maximum size (in KB) of a file of unhandled type to keep entirely (if the file is larger than that, only the first 10kb will be shown)
39
+ DEFAULT_OUTPUT_FILE = 'PROMPT' # default output base name (extension added via --format)
40
+ DEFAULT_FORMAT = 'markdown' # default output format
41
+
42
+ # Mapping of format types to their respective file extensions
43
+ SUPPORTED_FORMATS = {
44
+ 'xml': '.xml',
45
+ 'markdown': '.md'
46
+ }
47
+
48
+ # A unique identifier added to the top of every generated file to prevent recursive scanning.
49
+ GENERATION_FLAG = "DATA2PROMPT_GENERATED_CONTENT"
50
+
51
+ # --- LLM Structured Output Constants ---
52
+ # Refactored System Instructions (Repomix Style)
53
+ SYSTEM_INSTRUCTIONS_MARKDOWN = """## purpose: \nThis document is a structured representation of a codebase and data schema. It is designed to be consumed by a Large Language Model.
54
+ The output is organized into sections:
55
+ 1. Directory Structure: List of all files in this project.
56
+ 2. Files: The content of each file, clearly labeled with its path using '## File: {path}' headers.
57
+ For all standard files, content is wrapped in markdown code blocks using dynamic backtick depth to ensure robust nesting.
58
+ For notebooks, individual cells are clearly labeled with cell numbers, types, and their respective file paths.
59
+ For Excel files, individual sheets are clearly labeled with sheet names, numbers, and their respective file paths."""
60
+
61
+ SYSTEM_INSTRUCTIONS_XML = """<purpose>\nThis document is a structured representation of a codebase and data schema. It is designed to be consumed by a Large Language Model.
62
+ The output is organized into XML tags:
63
+ 1. <directory_structure>: List of all files in this project.
64
+ 2. <files>: Contains the repository's files.
65
+ 3. <file>: Represents a single file with a 'path' attribute.
66
+ 4. <cell>: Used within notebooks to encapsulate individual cells, featuring 'path', 'number', and 'type' attributes.
67
+ 5. <sheet>: Used within Excel files to encapsulate individual sheets, featuring 'name', 'number', and 'path' attributes.\n</purpose>"""
68
+
69
+ # Updated Tags
70
+ TAG_DIRECTORY_STRUCTURE = "directory_structure"
71
+ TAG_FILES = "files"
72
+ TAG_FILE = "file"
73
+ TAG_CONTENT = "content" # Used for notebook cells
74
+
75
+ # --- UI & Aesthetic Constants ---
76
+ MATRIX_DARK_GREEN = (0, 150, 0)
77
+ MATRIX_NEON_GREEN = (0, 255, 0)
78
+ STARTUP_ANIMATION_DURATION = 0.9
79
+ ANIMATION_FRAME_DELAY = 0.03
80
+
81
+ # Scroll Bar Characters
82
+ SCROLL_THUMB = "█"
83
+ SCROLL_TRACK = "│"
84
+
85
+ # ASCII Art for the application header
86
+ ASCII_ART = [
87
+ " ",
88
+ " ██╗ ██████╗ █████╗ ████████╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ███╗ ███╗ ██████╗ ████████╗",
89
+ " ╚██╗ ██╔══██╗ ██╔══██╗ ╚══██╔══╝ ██╔══██╗ ╚════██╗ ██╔══██╗ ██╔══██╗ ██╔═══██╗ ████╗ ████║ ██╔══██╗ ╚══██╔══╝",
90
+ " ╚██╗ ██║ ██║ ███████║ ██║ ███████║ █████╔╝ ██████╔╝ ██████╔╝ ██║ ██║ ██╔████╔██║ ██████╔╝ ██║ ",
91
+ " ██╔╝ ██║ ██║ ██╔══██║ ██║ ██╔══██║ ██╔═══╝ ██╔═══╝ ██╔══██╗ ██║ ██║ ██║╚██╔╝██║ ██╔═══╝ ██║ ",
92
+ " ██╔╝ ██████╔╝ ██║ ██║ ██║ ██║ ██║ ███████╗ ██║ ██║ ██║ ╚██████╔╝ ██║ ╚═╝ ██║ ██║ ██║ ",
93
+ " ╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ "
94
+ ]
data2prompt/main.py ADDED
@@ -0,0 +1,172 @@
1
+ import warnings
2
+
3
+ # Suppress known noisy warnings globally for a cleaner TUI experience
4
+ # We do this before importing pandas to ensure the filters are in place
5
+ warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
6
+
7
+ import pandas as pd
8
+ # Now that pandas is imported, we can reference its error types
9
+ warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
10
+ from pathlib import Path
11
+ from typing import Set
12
+ from .cli import setup_cli, Config
13
+ from .parsers import registry, ParserResult, flatten_ir
14
+ from .utils import ProjectScanner, count_tokens, check_connectivity
15
+ from .ui import ui
16
+ from .output import get_generator
17
+
18
+ def get_ui_action(ext: str, skip_exts: Set[str]) -> str:
19
+ """Determines the UI action string based on file extension."""
20
+ if ext in skip_exts: return "Skipping"
21
+ elif ext == '.csv': return "Sampling"
22
+ elif ext == '.ipynb': return "Cleaning"
23
+ elif ext == '.sql': return "Parsing"
24
+ elif ext in ['.xlsx', '.xls']: return "Extracting"
25
+ return "Reading"
26
+
27
+ def process_target_file(file_path: Path, config: Config) -> ParserResult:
28
+ """Handles a single file and returns its content, tokens, and metadata."""
29
+ ext = file_path.suffix.lower()
30
+
31
+ if ext in config.skip_exts:
32
+ return ParserResult(
33
+ content=f"*Note: Content skipped for ({ext}) file based on exclusion rules.*\n",
34
+ tokens=0,
35
+ type=f"Excluded ({ext})",
36
+ status="Skipped (Exclusion)",
37
+ stats_update={"excluded_count": 1}
38
+ )
39
+
40
+ parser = registry.get_parser(ext)
41
+ return parser.parse(file_path, config)
42
+
43
+ def main():
44
+ """
45
+ The main entry point for the Data2Prompt CLI.
46
+ Orchestrates the argument parsing, file discovery, content processing, and Markdown generation.
47
+ """
48
+ config = setup_cli() # Retrieve user settings from the terminal
49
+
50
+ project_path = Path.cwd()
51
+ scanner = ProjectScanner(
52
+ project_path=project_path,
53
+ ignore_folders=config.ignore_folders,
54
+ ignore_files=config.ignore_files,
55
+ output_file=config.output,
56
+ use_gitignore=config.use_gitignore
57
+ )
58
+
59
+ # Collect all files first to set progress bar total
60
+ all_files = scanner.scan()
61
+ total_steps = 1 + 1 + len(all_files) + 1
62
+
63
+ # Initialize UI and start process
64
+ ui.on_start("[cyan]Starting process...[/cyan]", total=total_steps)
65
+
66
+ stats = {
67
+ "file_count": 0,
68
+ "csv_count": 0,
69
+ "notebook_count": 0,
70
+ "sql_count": 0,
71
+ "excel_count": 0,
72
+ "excel_sheets_count": 0,
73
+ "truncated_count": 0,
74
+ "binary_count": 0,
75
+ "excluded_count": 0
76
+ }
77
+
78
+ # For the summary table
79
+ processed_files_info = []
80
+
81
+ with ui.progress_bar("[cyan]Starting process...[/cyan]", total=total_steps) as handler:
82
+ # 1. Checking connectivity
83
+ handler.on_progress("[cyan]Checking online connectivity...[/cyan]")
84
+ is_online = check_connectivity()
85
+ status_msg = "[green]Online[/green]" if is_online else "[yellow]Offline (using fallback)[/yellow]"
86
+ handler.on_progress(f"[cyan]Checking online connectivity... {status_msg}[/cyan]", advance=1)
87
+
88
+ # 2. Generating project tree
89
+ handler.on_progress("[cyan]Generating project tree...[/cyan]")
90
+ tree_text = scanner.generate_tree()
91
+ handler.on_progress("[cyan]Generating project tree...[/cyan]", advance=1)
92
+
93
+ # 2. Processing files
94
+ files_data = []
95
+
96
+ for file_path in all_files:
97
+ relative_path = file_path.relative_to(project_path)
98
+ ext = file_path.suffix.lower()
99
+ stats["file_count"] += 1
100
+
101
+ # Determine action for progress bar - show only filename
102
+ action = get_ui_action(ext, config.skip_exts)
103
+ handler.on_progress(f"[cyan]{action}[/cyan] [bold]{file_path.name}[/bold] [cyan]...[/cyan]")
104
+
105
+ result = process_target_file(file_path, config)
106
+ if result.skip_file:
107
+ handler.on_progress(f"[cyan]{action}[/cyan] [bold]{file_path.name}[/bold] [cyan]...[/cyan]", advance=1)
108
+ continue
109
+
110
+ # Collect file data for the generator
111
+ files_data.append({
112
+ "path": str(relative_path),
113
+ "content": result.content,
114
+ "type": result.type,
115
+ "tokens": result.tokens,
116
+ "status": result.status
117
+ })
118
+
119
+ # Update stats
120
+ for key, value in result.stats_update.items():
121
+ stats[key] += value
122
+
123
+ processed_files_info.append({
124
+ "name": str(relative_path),
125
+ "type": result.type,
126
+ "tokens": result.tokens,
127
+ "status": result.status
128
+ })
129
+
130
+ handler.on_progress(f"[cyan]{action}[/cyan] [bold]{file_path.name}[/bold] [cyan]...[/cyan]", advance=1)
131
+
132
+ # 3. Compiling project context
133
+ handler.on_progress("[cyan]Compiling project context...[/cyan]")
134
+
135
+ # We need a temporary token count for the final report
136
+ # The generator will handle the final string construction
137
+ # We use flatten_ir to convert structured content to strings for token counting
138
+ temp_content = "\n".join([flatten_ir(f["content"]) for f in files_data]) + tree_text
139
+ total_tokens, method = count_tokens(temp_content)
140
+
141
+ generator = get_generator(config.format)
142
+ final_output = generator.generate(
143
+ project_name=project_path.name,
144
+ tree_text=tree_text,
145
+ files_data=files_data,
146
+ stats=stats,
147
+ total_tokens=total_tokens,
148
+ token_method=method,
149
+ config=config
150
+ )
151
+
152
+ with open(config.output, 'w', encoding='utf-8') as f:
153
+ f.write(final_output)
154
+ handler.on_progress("[cyan]Compiling project context...[/cyan]", advance=1)
155
+
156
+ # Final File Size Check
157
+ file_size_kb = Path(config.output).stat().st_size / 1024
158
+
159
+ # Display Final Report (Interactive Summary + Success Panel)
160
+ ui.print_final_report(processed_files_info, config.output, file_size_kb, total_tokens, stats, method)
161
+
162
+ if file_size_kb > 2000:
163
+ ui.print_warning_panel(
164
+ "[bold yellow]WARNING:[/bold yellow] File is over 2MB. This might be too large for some context windows.\n"
165
+ "[bold cyan]Suggestion:[/bold cyan] Reduce --csv-sample-size, --sql-sample-size or --max-lines."
166
+ )
167
+
168
+ if __name__ == "__main__":
169
+ main()
170
+
171
+ # Alias for backward compatibility with stale entry point scripts
172
+ run_packager = main
data2prompt/output.py ADDED
@@ -0,0 +1,235 @@
1
+ import os
2
+ import pandas as pd
3
+ from abc import ABC, abstractmethod
4
+ from typing import List, Dict, Any, TYPE_CHECKING
5
+ from pathlib import Path
6
+ from xml.sax.saxutils import escape, quoteattr
7
+
8
+ if TYPE_CHECKING:
9
+ from .cli import Config
10
+
11
+ from .constants import (
12
+ TAG_DIRECTORY_STRUCTURE,
13
+ TAG_FILES,
14
+ TAG_FILE,
15
+ TAG_CONTENT,
16
+ SYSTEM_INSTRUCTIONS_MARKDOWN,
17
+ SYSTEM_INSTRUCTIONS_XML,
18
+ GENERATION_FLAG
19
+ )
20
+ from .utils import get_dynamic_wrapper
21
+ from .parsers import NotebookCellIR, TableIR, enforce_table_limit
22
+
23
+ class OutputGenerator(ABC):
24
+ @abstractmethod
25
+ def generate(self,
26
+ project_name: str,
27
+ tree_text: str,
28
+ files_data: List[Dict[str, Any]],
29
+ stats: Dict[str, Any],
30
+ total_tokens: int,
31
+ token_method: str,
32
+ config: 'Config' = None) -> str:
33
+ pass
34
+
35
+ class MarkdownGenerator(OutputGenerator):
36
+ def generate(self,
37
+ project_name: str,
38
+ tree_text: str,
39
+ files_data: List[Dict[str, Any]],
40
+ stats: Dict[str, Any],
41
+ total_tokens: int,
42
+ token_method: str,
43
+ config: 'Config' = None) -> str:
44
+
45
+ timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
46
+ method_label = "o200k_base" if token_method == "o200k_base" else "regex_fallback" if token_method == "regex_fallback" else "word_count"
47
+
48
+ lines = [
49
+ f"<!-- {GENERATION_FLAG} -->",
50
+ "",
51
+ f"# codebase: {project_name}",
52
+ "",
53
+ SYSTEM_INSTRUCTIONS_MARKDOWN,
54
+ "",
55
+ f"> Generated on: {timestamp}",
56
+ f"> Tokens: {total_tokens} (est. via {method_label})",
57
+ "",
58
+ "# Directory Structure",
59
+ "```text",
60
+ tree_text,
61
+ "```",
62
+ "",
63
+ "# Files",
64
+ "",
65
+ "This section contains the contents of the repository's files.",
66
+ ""
67
+ ]
68
+
69
+ for file_info in files_data:
70
+ rel_path = file_info['path']
71
+ # Normalize path to match directory structure (always use backslashes)
72
+ display_path = rel_path.replace(os.sep, '\\')
73
+ content = file_info['content']
74
+ ext = Path(rel_path).suffix.lower()
75
+
76
+ lines.append(f"## File: {display_path}")
77
+
78
+ if isinstance(content, list) and content and isinstance(content[0], NotebookCellIR):
79
+ # Render Notebook IR
80
+ for cell in content:
81
+ lines.append(f"### Cell {cell.number} ({cell.type}) - {display_path}")
82
+ wrapper = get_dynamic_wrapper(cell.source)
83
+ lang = 'python' if cell.type == 'code' else 'markdown'
84
+ lines.append(f"{wrapper}{lang}")
85
+ lines.append(cell.source)
86
+ lines.append(wrapper)
87
+
88
+ if cell.outputs:
89
+ lines.append("\n**Outputs:**")
90
+ lines.append("```text")
91
+ lines.append(cell.outputs)
92
+ lines.append("```")
93
+ lines.append("")
94
+
95
+ elif isinstance(content, list) and content and isinstance(content[0], TableIR):
96
+ # Render Table IR (CSV/Excel)
97
+ for table in content:
98
+ table_parts = []
99
+
100
+ # Handle Excel Sheet Metadata
101
+ if table.sheet_number is not None:
102
+ lines.append(f"### Sheet {table.sheet_number}: {table.name} - {table.file_path}")
103
+
104
+ if table.header_note:
105
+ table_parts.append(table.header_note)
106
+
107
+ if not table.df.empty:
108
+ table_parts.append(table.df.to_markdown(index=False))
109
+
110
+ if table.footer_note:
111
+ table_parts.append(table.footer_note)
112
+
113
+ table_text = "\n".join(table_parts)
114
+ if config:
115
+ table_text = enforce_table_limit(table_text, config.table_limit, config.table_truncate)
116
+
117
+ lines.append(table_text)
118
+
119
+ # Close Sheet block if applicable
120
+ if table.sheet_number is not None:
121
+ lines.append("---")
122
+
123
+ lines.append("")
124
+
125
+ else:
126
+ # Standard files or fallback string content
127
+ str_content = str(content)
128
+ wrapper = get_dynamic_wrapper(str_content)
129
+ lang = ext[1:] if ext and ext != '.md' else 'markdown' if ext == '.md' else 'text'
130
+ lines.append(f"{wrapper}{lang}")
131
+ lines.append(str_content)
132
+ lines.append(wrapper)
133
+
134
+ lines.append("")
135
+
136
+ return "\n".join(lines)
137
+
138
+ class XMLGenerator(OutputGenerator):
139
+ def generate(self,
140
+ project_name: str,
141
+ tree_text: str,
142
+ files_data: List[Dict[str, Any]],
143
+ stats: Dict[str, Any],
144
+ total_tokens: int,
145
+ token_method: str,
146
+ config: 'Config' = None) -> str:
147
+
148
+ timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
149
+ method_label = "o200k_base" if token_method == "o200k_base" else "regex_fallback" if token_method == "regex_fallback" else "word_count"
150
+
151
+ lines = [
152
+ f"<!-- {GENERATION_FLAG} -->",
153
+ "",
154
+ f'<codebase name={quoteattr(project_name)}>',
155
+ "",
156
+ SYSTEM_INSTRUCTIONS_XML,
157
+ "",
158
+ "<metadata>",
159
+ f" <generated_on>{timestamp}</generated_on>",
160
+ f' <total_tokens method="{method_label}">{total_tokens}</total_tokens>',
161
+ "</metadata>",
162
+ "",
163
+ f"<{TAG_DIRECTORY_STRUCTURE}>",
164
+ escape(tree_text),
165
+ f"</{TAG_DIRECTORY_STRUCTURE}>",
166
+ "",
167
+ f"<{TAG_FILES}>",
168
+ "This section contains the contents of the repository's files.",
169
+ ""
170
+ ]
171
+
172
+ for file_info in files_data:
173
+ rel_path = file_info['path']
174
+ # Normalize path to match directory structure (always use backslashes)
175
+ display_path = rel_path.replace(os.sep, '\\')
176
+ content = file_info['content']
177
+
178
+ lines.append(f'<{TAG_FILE} path="{display_path}">')
179
+
180
+ if isinstance(content, list) and content and isinstance(content[0], NotebookCellIR):
181
+ # Render Notebook IR to XML
182
+ for cell in content:
183
+ lines.append(f' <cell path="{display_path}" index="{cell.number}" type="{cell.type}">')
184
+ lines.append(f' <{TAG_CONTENT}>')
185
+ lines.append(escape(cell.source))
186
+ lines.append(f' </{TAG_CONTENT}>')
187
+ if cell.outputs:
188
+ lines.append(' <outputs>')
189
+ lines.append(escape(cell.outputs))
190
+ lines.append(' </outputs>')
191
+ lines.append(' </cell>')
192
+
193
+ elif isinstance(content, list) and content and isinstance(content[0], TableIR):
194
+ # Render Table IR to XML
195
+ for table in content:
196
+ # Handle Excel Sheet Metadata
197
+ if table.sheet_number is not None:
198
+ lines.append(f'<sheet name="{table.name}" sheet_number="{table.sheet_number}" path="{table.file_path}">')
199
+
200
+ table_parts = []
201
+ if table.header_note:
202
+ table_parts.append(table.header_note)
203
+
204
+ if not table.df.empty:
205
+ table_parts.append(table.df.to_markdown(index=False))
206
+
207
+ if table.footer_note:
208
+ table_parts.append(table.footer_note)
209
+
210
+ table_text = "\n".join(table_parts)
211
+ if config:
212
+ table_text = enforce_table_limit(table_text, config.table_limit, config.table_truncate)
213
+
214
+ lines.append(escape(table_text))
215
+
216
+ # Close Sheet block if applicable
217
+ if table.sheet_number is not None:
218
+ lines.append('</sheet>')
219
+
220
+ else:
221
+ # Standard files or fallback string content
222
+ lines.append(str(content))
223
+
224
+ lines.append(f"</{TAG_FILE}>")
225
+ lines.append("")
226
+
227
+ lines.append(f"</{TAG_FILES}>")
228
+ lines.append("</codebase>")
229
+
230
+ return "\n".join(lines)
231
+
232
+ def get_generator(format_type: str) -> OutputGenerator:
233
+ if format_type.lower() == 'markdown':
234
+ return MarkdownGenerator()
235
+ return XMLGenerator()