data2prompt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data2prompt/__init__.py +0 -0
- data2prompt/cli.py +146 -0
- data2prompt/constants.py +94 -0
- data2prompt/main.py +172 -0
- data2prompt/output.py +235 -0
- data2prompt/parsers.py +579 -0
- data2prompt/ui.py +357 -0
- data2prompt/utils.py +222 -0
- data2prompt-0.1.0.dist-info/METADATA +166 -0
- data2prompt-0.1.0.dist-info/RECORD +14 -0
- data2prompt-0.1.0.dist-info/WHEEL +5 -0
- data2prompt-0.1.0.dist-info/entry_points.txt +2 -0
- data2prompt-0.1.0.dist-info/licenses/LICENSE +21 -0
- data2prompt-0.1.0.dist-info/top_level.txt +1 -0
data2prompt/__init__.py
ADDED
|
File without changes
|
data2prompt/cli.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import List, Set
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from argparse import Namespace
|
|
6
|
+
|
|
7
|
+
from .constants import (
|
|
8
|
+
CORE_IGNORES,
|
|
9
|
+
CORE_IGNORE_FILES,
|
|
10
|
+
CORE_SKIP_EXTS,
|
|
11
|
+
DEFAULT_CSV_SAMPLE_SIZE,
|
|
12
|
+
DEFAULT_SQL_SAMPLE_SIZE,
|
|
13
|
+
DEFAULT_SQL_MAX_LINES,
|
|
14
|
+
DEFAULT_MAX_LINES,
|
|
15
|
+
DEFAULT_MAX_SHEETS,
|
|
16
|
+
DEFAULT_SEED,
|
|
17
|
+
DEFAULT_LINE_LENGTH_THRESHOLD,
|
|
18
|
+
DEFAULT_TRUNCATED_LINE_LENGTH,
|
|
19
|
+
DEFAULT_TABLE_CHAR_LIMIT,
|
|
20
|
+
DEFAULT_TABLE_TRUNCATED_SIZE,
|
|
21
|
+
DEFAULT_MAX_FILE_SIZE_KB,
|
|
22
|
+
DEFAULT_OUTPUT_FILE,
|
|
23
|
+
DEFAULT_FORMAT,
|
|
24
|
+
SUPPORTED_FORMATS
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
@dataclass
class Config:
    """Resolved settings for one data2prompt run.

    Instances are produced by ``setup_cli`` from the parsed command line.
    Field order is significant: the dataclass-generated ``__init__`` accepts
    positional arguments in declaration order.
    """

    # --- Output ---
    output: str  # Output file name (base name plus format extension).
    format: str  # Output format key, e.g. 'xml' or 'markdown'.

    # --- CSV sampling ---
    csv_sample_size: int  # Number of random rows to sample from CSVs.
    seed: int  # Random seed for consistent sampling.

    # --- SQL sampling ---
    sql_sample_size: int  # Number of INSERT statements to keep in SQL files.
    sql_max_lines: int  # Max non-data lines to keep in SQL files.

    # --- Notebooks / Excel ---
    max_lines: int  # Max lines of text output to keep per notebook cell.
    max_sheets: int  # Max number of sheets to process in Excel files.

    # --- Line truncation ---
    line_length_threshold: int  # Max characters per line before truncation.
    truncated_line_length: int  # Length long lines are truncated to.

    # --- Table truncation ---
    table_limit: int  # Max characters for a single table/sheet after sampling.
    table_truncate: int  # Length large tables are truncated to.

    # --- Exclusions (each instance gets its own fresh set) ---
    ignore_folders: Set[str] = field(default_factory=set)
    ignore_files: Set[str] = field(default_factory=set)
    max_file_size: int = 0  # Max file size in KB to read entirely.
    skip_exts: Set[str] = field(default_factory=set)
    use_gitignore: bool = True  # False disables .gitignore detection/filtering.
|
|
47
|
+
|
|
48
|
+
def setup_cli(argv: "List[str] | None" = None) -> Config:
    """Parse the command line and build the application ``Config``.

    Defines every available flag with its help text, parses the arguments,
    and merges user-supplied exclusions with the CORE_* constants so that
    essential items (such as '.git' or binary extensions) are always
    respected.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        Config: A type-safe configuration object.
    """
    parser = argparse.ArgumentParser(
        description="📊 Data2Prompt: High-tech prompt packaging for Data Scientists."
    )

    # Output settings
    parser.add_argument('-o', '--output', default=DEFAULT_OUTPUT_FILE,
                        help=f'Base name of the generated file (default: {DEFAULT_OUTPUT_FILE})')

    parser.add_argument('-f', '--format', choices=list(SUPPORTED_FORMATS.keys()), default=DEFAULT_FORMAT,
                        help=f'Output format: xml or markdown (default: {DEFAULT_FORMAT})')

    # CSV sampling settings
    parser.add_argument('-s', '--csv-sample-size', type=int, default=DEFAULT_CSV_SAMPLE_SIZE,
                        help=f'Number of random rows to sample from CSVs (default: {DEFAULT_CSV_SAMPLE_SIZE})')
    parser.add_argument('--seed', type=int, default=DEFAULT_SEED,
                        help=f'Random seed for consistent CSV sampling (default: {DEFAULT_SEED})')

    # SQL sampling settings
    parser.add_argument('--sql-sample-size', type=int, default=DEFAULT_SQL_SAMPLE_SIZE,
                        help=f'Number of INSERT statements to keep in SQL files (default: {DEFAULT_SQL_SAMPLE_SIZE})')

    parser.add_argument('--sql-max-lines', type=int, default=DEFAULT_SQL_MAX_LINES,
                        help=f'Max non-data lines to keep in SQL files (default: {DEFAULT_SQL_MAX_LINES})')

    # Notebook settings
    parser.add_argument('--max-lines', type=int, default=DEFAULT_MAX_LINES,
                        help=f'Max lines of text output to keep per notebook cell (default: {DEFAULT_MAX_LINES})')

    # Excel settings
    parser.add_argument('--max-sheets', type=int, default=DEFAULT_MAX_SHEETS,
                        help=f'Max number of sheets to process in Excel files (default: {DEFAULT_MAX_SHEETS})')

    # Line Truncation settings
    parser.add_argument('--line-length-threshold', type=int, default=DEFAULT_LINE_LENGTH_THRESHOLD,
                        help=f'Max characters per line before truncation (default: {DEFAULT_LINE_LENGTH_THRESHOLD})')
    parser.add_argument('--truncated-line-length', type=int, default=DEFAULT_TRUNCATED_LINE_LENGTH,
                        help=f'Length to truncate long lines to (default: {DEFAULT_TRUNCATED_LINE_LENGTH})')

    # Table Truncation settings
    parser.add_argument('--table-limit', type=int, default=DEFAULT_TABLE_CHAR_LIMIT,
                        help=f'Max characters for a single table/sheet after sampling (default: {DEFAULT_TABLE_CHAR_LIMIT})')
    parser.add_argument('--table-truncate', type=int, default=DEFAULT_TABLE_TRUNCATED_SIZE,
                        help=f'Length to truncate large tables to (default: {DEFAULT_TABLE_TRUNCATED_SIZE})')

    # Exclusions
    parser.add_argument('--ignore-folders', nargs='+', default=[],
                        help='Additional folders to skip entirely')

    parser.add_argument('--ignore-files', nargs='+', default=[],
                        help='Additional files to skip entirely')

    parser.add_argument('--max-file-size', type=int, default=DEFAULT_MAX_FILE_SIZE_KB,
                        help=f'Max file size in KB to read entirely (default: {DEFAULT_MAX_FILE_SIZE_KB}KB)')

    # file formats to ignore
    parser.add_argument('--skip-exts', nargs='+', default=[],
                        help='Additional file extensions to skip content for')

    parser.add_argument('--no-gitignore', action='store_false', dest='use_gitignore',
                        help='Disable automatic .gitignore detection and filtering')

    args = parser.parse_args(argv)

    # --- Argument Merging Logic ---
    # We combine the user's terminal input with our CORE constants.
    # This ensures that even if a user provides custom ignores, essential items
    # like '.git' or binary extensions are still respected.

    # Combine base name with format-specific extension, unless the user already
    # typed it (avoids names such as 'PROMPT.md.md').
    extension = SUPPORTED_FORMATS.get(args.format, SUPPORTED_FORMATS.get(DEFAULT_FORMAT))
    if args.output.lower().endswith(extension.lower()):
        final_output_name = args.output
    else:
        final_output_name = f"{args.output}{extension}"

    # Extensions are matched against a lower-cased Path.suffix (leading dot
    # included), so normalise user input: 'CSV', 'csv' and '.csv' all become
    # '.csv'.
    user_skip_exts = set()
    for raw_ext in args.skip_exts:
        cleaned = raw_ext.strip().lower()
        if cleaned:
            user_skip_exts.add(cleaned if cleaned.startswith('.') else f".{cleaned}")

    return Config(
        output=final_output_name,
        format=args.format,
        csv_sample_size=args.csv_sample_size,
        seed=args.seed,
        sql_sample_size=args.sql_sample_size,
        sql_max_lines=args.sql_max_lines,
        max_lines=args.max_lines,
        max_sheets=args.max_sheets,
        line_length_threshold=args.line_length_threshold,
        truncated_line_length=args.truncated_line_length,
        table_limit=args.table_limit,
        table_truncate=args.table_truncate,
        ignore_folders=set(args.ignore_folders) | CORE_IGNORES,
        ignore_files=set(args.ignore_files) | CORE_IGNORE_FILES,
        max_file_size=args.max_file_size,
        skip_exts=user_skip_exts | CORE_SKIP_EXTS,
        use_gitignore=args.use_gitignore
    )
|
data2prompt/constants.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# --- Core Defaults & Constants ---

# Folders matching these names are excluded from both the project tree and content processing.
CORE_IGNORES = {
    '.git', '__pycache__', 'venv', '.vscode', '.ipynb_checkpoints',
    'node_modules', '.idea', 'dist', 'build', '.mypy_cache',
    '.pytest_cache', 'target', '.docker', '.aws', '.gcloud',
    '__MACOSX'
}

# Specific filenames that should be excluded from the entire process.
# Empty by default; the CLI unions user-supplied --ignore-files into it.
CORE_IGNORE_FILES = set()

# Files with these extensions will have their names listed in the project tree,
# but their actual content will be skipped.
# Entries are lower-case and include the leading dot.
CORE_SKIP_EXTS = {
    # Data & Databases
    '.pbix', '.db', '.sqlite', '.sqlite3', '.parquet', '.pkl', '.pickle', '.feather', '.h5',
    # Compressed & Binary
    '.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.bin',
    # Media
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.pdf', '.mp4', '.mp3', '.mov',
    # Environment & Secrets
    # NOTE(review): pathlib reports an empty suffix for pure dotfiles such as
    # '.env' or '.DS_Store'; confirm callers also match these entries by name.
    '.env', '.venv', '.pyc', '.ds_store'
}

# Default values for CLI arguments and processing functions
DEFAULT_CSV_SAMPLE_SIZE = 15  # Controls the number of rows per csv file.
DEFAULT_SQL_SAMPLE_SIZE = 15  # Controls the number of INSERT/data rows kept per table in SQL files.
DEFAULT_SQL_MAX_LINES = 50  # Caps the total number of non-data lines (comments, setup, etc.) in SQL files.
DEFAULT_MAX_LINES = 40  # Max lines of text output to keep per notebook cell.
DEFAULT_MAX_SHEETS = 10  # Max number of sheets to process in Excel files.
DEFAULT_SEED = 42  # Random seed for consistent sampling.
DEFAULT_LINE_LENGTH_THRESHOLD = 4000  # Max characters allowed per line before truncation is triggered.
DEFAULT_TRUNCATED_LINE_LENGTH = 1000  # Number of characters to keep when a line is truncated.
DEFAULT_TABLE_CHAR_LIMIT = 50000  # Max characters allowed for a single table/sheet representation after sampling.
DEFAULT_TABLE_TRUNCATED_SIZE = 20000  # Number of characters to keep when a table/sheet is truncated due to size.
DEFAULT_MAX_FILE_SIZE_KB = 70  # Maximum size (in KB) of a file of unhandled type to keep entirely (if the file is larger than that, only the first 10 KB will be shown).
DEFAULT_OUTPUT_FILE = 'PROMPT'  # Default output base name (extension added via --format).
DEFAULT_FORMAT = 'markdown'  # Default output format; must be a key of SUPPORTED_FORMATS.

# Mapping of format types to their respective file extensions
SUPPORTED_FORMATS = {
    'xml': '.xml',
    'markdown': '.md'
}

# A unique identifier added to the top of every generated file to prevent recursive scanning.
GENERATION_FLAG = "DATA2PROMPT_GENERATED_CONTENT"
|
|
50
|
+
|
|
51
|
+
# --- LLM Structured Output Constants ---
# Refactored System Instructions (Repomix Style)
# Preamble placed near the top of Markdown output; it describes the layout of
# the sections that follow so the consuming model can navigate the document.
SYSTEM_INSTRUCTIONS_MARKDOWN = """## purpose: \nThis document is a structured representation of a codebase and data schema. It is designed to be consumed by a Large Language Model.
The output is organized into sections:
1. Directory Structure: List of all files in this project.
2. Files: The content of each file, clearly labeled with its path using '## File: {path}' headers.
For all standard files, content is wrapped in markdown code blocks using dynamic backtick depth to ensure robust nesting.
For notebooks, individual cells are clearly labeled with cell numbers, types, and their respective file paths.
For Excel files, individual sheets are clearly labeled with sheet names, numbers, and their respective file paths."""

# Preamble placed near the top of XML output; it describes the tag vocabulary.
# NOTE(review): the text below promises a 'number' attribute on <cell> and
# <sheet>, while the XML generator emits 'index' and 'sheet_number' - confirm
# which side is intended.
SYSTEM_INSTRUCTIONS_XML = """<purpose>\nThis document is a structured representation of a codebase and data schema. It is designed to be consumed by a Large Language Model.
The output is organized into XML tags:
1. <directory_structure>: List of all files in this project.
2. <files>: Contains the repository's files.
3. <file>: Represents a single file with a 'path' attribute.
4. <cell>: Used within notebooks to encapsulate individual cells, featuring 'path', 'number', and 'type' attributes.
5. <sheet>: Used within Excel files to encapsulate individual sheets, featuring 'name', 'number', and 'path' attributes.\n</purpose>"""

# Updated Tags
# Element names used when building the XML output.
TAG_DIRECTORY_STRUCTURE = "directory_structure"
TAG_FILES = "files"
TAG_FILE = "file"
TAG_CONTENT = "content"  # Used for notebook cells
|
|
74
|
+
|
|
75
|
+
# --- UI & Aesthetic Constants ---
# Three-integer colour tuples (presumably RGB, 0-255 - confirm against the UI code).
MATRIX_DARK_GREEN = (0, 150, 0)
MATRIX_NEON_GREEN = (0, 255, 0)
# Animation timing values (presumably seconds - confirm against the UI code).
STARTUP_ANIMATION_DURATION = 0.9
ANIMATION_FRAME_DELAY = 0.03

# Scroll Bar Characters
SCROLL_THUMB = "█"
SCROLL_TRACK = "│"

# ASCII Art for the application header
# One string per rendered row, top to bottom.
ASCII_ART = [
    " ",
    " ██╗ ██████╗ █████╗ ████████╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ███╗ ███╗ ██████╗ ████████╗",
    " ╚██╗ ██╔══██╗ ██╔══██╗ ╚══██╔══╝ ██╔══██╗ ╚════██╗ ██╔══██╗ ██╔══██╗ ██╔═══██╗ ████╗ ████║ ██╔══██╗ ╚══██╔══╝",
    " ╚██╗ ██║ ██║ ███████║ ██║ ███████║ █████╔╝ ██████╔╝ ██████╔╝ ██║ ██║ ██╔████╔██║ ██████╔╝ ██║ ",
    " ██╔╝ ██║ ██║ ██╔══██║ ██║ ██╔══██║ ██╔═══╝ ██╔═══╝ ██╔══██╗ ██║ ██║ ██║╚██╔╝██║ ██╔═══╝ ██║ ",
    " ██╔╝ ██████╔╝ ██║ ██║ ██║ ██║ ██║ ███████╗ ██║ ██║ ██║ ╚██████╔╝ ██║ ╚═╝ ██║ ██║ ██║ ",
    " ╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ "
]
|
data2prompt/main.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
|
|
3
|
+
# Suppress known noisy warnings globally for a cleaner TUI experience
|
|
4
|
+
# We do this before importing pandas to ensure the filters are in place
|
|
5
|
+
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
# Now that pandas is imported, we can reference its error types
|
|
9
|
+
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Set
|
|
12
|
+
from .cli import setup_cli, Config
|
|
13
|
+
from .parsers import registry, ParserResult, flatten_ir
|
|
14
|
+
from .utils import ProjectScanner, count_tokens, check_connectivity
|
|
15
|
+
from .ui import ui
|
|
16
|
+
from .output import get_generator
|
|
17
|
+
|
|
18
|
+
def get_ui_action(ext: str, skip_exts: Set[str]) -> str:
    """Return the progress-bar verb for a file with extension ``ext``.

    Args:
        ext: File extension including the leading dot (e.g. '.csv').
        skip_exts: Extensions whose content is skipped; these take priority
            over every type-specific verb.

    Returns:
        "Skipping" for skipped extensions, a type-specific verb for CSV,
        notebook, SQL and Excel files, and "Reading" for everything else.
    """
    if ext in skip_exts:
        return "Skipping"

    verb_by_ext = {
        '.csv': "Sampling",
        '.ipynb': "Cleaning",
        '.sql': "Parsing",
        '.xlsx': "Extracting",
        '.xls': "Extracting",
    }
    return verb_by_ext.get(ext, "Reading")
|
|
26
|
+
|
|
27
|
+
def process_target_file(file_path: Path, config: Config) -> ParserResult:
    """Handle a single file and return its content, tokens, and metadata.

    Files whose extension is listed in ``config.skip_exts`` are not read; a
    placeholder result is returned instead. Every other file is delegated to
    the parser registered for its extension.

    Args:
        file_path: Path of the file to process.
        config: Active configuration; ``skip_exts`` is consulted here and the
            whole object is forwarded to the parser.

    Returns:
        ParserResult: Either the exclusion placeholder or the parser's result.
    """
    ext = file_path.suffix.lower()

    # pathlib gives pure dotfiles ('.env', '.DS_Store') an empty suffix, so
    # entries like '.env' in skip_exts would never match on the suffix alone.
    # Fall back to the lower-cased file name for those.
    skip_key = ext
    if not ext and file_path.name.startswith('.'):
        skip_key = file_path.name.lower()

    if skip_key in config.skip_exts:
        return ParserResult(
            content=f"*Note: Content skipped for ({skip_key}) file based on exclusion rules.*\n",
            tokens=0,
            type=f"Excluded ({skip_key})",
            status="Skipped (Exclusion)",
            stats_update={"excluded_count": 1}
        )

    # Parser lookup still uses the real suffix so non-skipped dotfiles are
    # routed exactly as before.
    parser = registry.get_parser(ext)
    return parser.parse(file_path, config)
|
|
42
|
+
|
|
43
|
+
def main():
    """Main entry point for the Data2Prompt CLI.

    Orchestrates argument parsing, file discovery, per-file content
    processing, and generation of the output document in the configured
    format. The result is written to ``config.output`` in the current
    working directory, followed by a final report on the terminal.
    """
    config = setup_cli()  # Retrieve user settings from the terminal

    project_path = Path.cwd()
    scanner = ProjectScanner(
        project_path=project_path,
        ignore_folders=config.ignore_folders,
        ignore_files=config.ignore_files,
        output_file=config.output,
        use_gitignore=config.use_gitignore
    )

    # Collect all files first to set progress bar total:
    # connectivity check + project tree + one step per file + compilation.
    all_files = scanner.scan()
    total_steps = 1 + 1 + len(all_files) + 1

    # Initialize UI and start process
    ui.on_start("[cyan]Starting process...[/cyan]", total=total_steps)

    stats = {
        "file_count": 0,
        "csv_count": 0,
        "notebook_count": 0,
        "sql_count": 0,
        "excel_count": 0,
        "excel_sheets_count": 0,
        "truncated_count": 0,
        "binary_count": 0,
        "excluded_count": 0
    }

    # For the summary table
    processed_files_info = []

    with ui.progress_bar("[cyan]Starting process...[/cyan]", total=total_steps) as handler:
        # 1. Checking connectivity
        handler.on_progress("[cyan]Checking online connectivity...[/cyan]")
        is_online = check_connectivity()
        status_msg = "[green]Online[/green]" if is_online else "[yellow]Offline (using fallback)[/yellow]"
        handler.on_progress(f"[cyan]Checking online connectivity... {status_msg}[/cyan]", advance=1)

        # 2. Generating project tree
        handler.on_progress("[cyan]Generating project tree...[/cyan]")
        tree_text = scanner.generate_tree()
        handler.on_progress("[cyan]Generating project tree...[/cyan]", advance=1)

        # 3. Processing files
        files_data = []

        for file_path in all_files:
            relative_path = file_path.relative_to(project_path)
            ext = file_path.suffix.lower()
            stats["file_count"] += 1

            # Determine action for progress bar - show only filename
            action = get_ui_action(ext, config.skip_exts)
            progress_label = f"[cyan]{action}[/cyan] [bold]{file_path.name}[/bold] [cyan]...[/cyan]"
            handler.on_progress(progress_label)

            result = process_target_file(file_path, config)
            if result.skip_file:
                # Parser asked to leave this file out entirely; still advance the bar.
                handler.on_progress(progress_label, advance=1)
                continue

            # Collect file data for the generator
            files_data.append({
                "path": str(relative_path),
                "content": result.content,
                "type": result.type,
                "tokens": result.tokens,
                "status": result.status
            })

            # Update stats. Use .get so a parser reporting a counter that is
            # not pre-seeded above adds a new entry instead of raising KeyError.
            for key, value in result.stats_update.items():
                stats[key] = stats.get(key, 0) + value

            processed_files_info.append({
                "name": str(relative_path),
                "type": result.type,
                "tokens": result.tokens,
                "status": result.status
            })

            handler.on_progress(progress_label, advance=1)

        # 4. Compiling project context
        handler.on_progress("[cyan]Compiling project context...[/cyan]")

        # We need a temporary token count for the final report
        # The generator will handle the final string construction
        # We use flatten_ir to convert structured content to strings for token counting
        temp_content = "\n".join([flatten_ir(f["content"]) for f in files_data]) + tree_text
        total_tokens, method = count_tokens(temp_content)

        generator = get_generator(config.format)
        final_output = generator.generate(
            project_name=project_path.name,
            tree_text=tree_text,
            files_data=files_data,
            stats=stats,
            total_tokens=total_tokens,
            token_method=method,
            config=config
        )

        with open(config.output, 'w', encoding='utf-8') as f:
            f.write(final_output)
        handler.on_progress("[cyan]Compiling project context...[/cyan]", advance=1)

    # Final File Size Check
    file_size_kb = Path(config.output).stat().st_size / 1024

    # Display Final Report (Interactive Summary + Success Panel)
    ui.print_final_report(processed_files_info, config.output, file_size_kb, total_tokens, stats, method)

    if file_size_kb > 2000:
        ui.print_warning_panel(
            "[bold yellow]WARNING:[/bold yellow] File is over 2MB. This might be too large for some context windows.\n"
            "[bold cyan]Suggestion:[/bold cyan] Reduce --csv-sample-size, --sql-sample-size or --max-lines."
        )
|
|
167
|
+
|
|
168
|
+
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()

# Alias for backward compatibility with stale entry point scripts
# that still reference the `run_packager` name.
run_packager = main
|
data2prompt/output.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import List, Dict, Any, TYPE_CHECKING
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from xml.sax.saxutils import escape, quoteattr
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from .cli import Config
|
|
10
|
+
|
|
11
|
+
from .constants import (
|
|
12
|
+
TAG_DIRECTORY_STRUCTURE,
|
|
13
|
+
TAG_FILES,
|
|
14
|
+
TAG_FILE,
|
|
15
|
+
TAG_CONTENT,
|
|
16
|
+
SYSTEM_INSTRUCTIONS_MARKDOWN,
|
|
17
|
+
SYSTEM_INSTRUCTIONS_XML,
|
|
18
|
+
GENERATION_FLAG
|
|
19
|
+
)
|
|
20
|
+
from .utils import get_dynamic_wrapper
|
|
21
|
+
from .parsers import NotebookCellIR, TableIR, enforce_table_limit
|
|
22
|
+
|
|
23
|
+
class OutputGenerator(ABC):
    """Abstract interface shared by all output format generators."""

    @abstractmethod
    def generate(
        self,
        project_name: str,
        tree_text: str,
        files_data: List[Dict[str, Any]],
        stats: Dict[str, Any],
        total_tokens: int,
        token_method: str,
        config: 'Config' = None,
    ) -> str:
        """Build the complete output document as a single string.

        Args:
            project_name: Name of the project being packaged.
            tree_text: Pre-rendered directory tree.
            files_data: One dict per processed file; the concrete generators
                read its 'path' and 'content' entries.
            stats: Aggregate counters collected during processing.
            total_tokens: Estimated token count reported in the output.
            token_method: Identifier of the token-counting method used.
            config: Optional application configuration.

        Returns:
            The rendered document. This abstract placeholder returns ``None``;
            subclasses must override it.
        """
        return None
|
|
34
|
+
|
|
35
|
+
class MarkdownGenerator(OutputGenerator):
    """Renders the collected project data as one Markdown document."""

    def generate(self,
                 project_name: str,
                 tree_text: str,
                 files_data: List[Dict[str, Any]],
                 stats: Dict[str, Any],
                 total_tokens: int,
                 token_method: str,
                 config: 'Config' = None) -> str:
        """Build the Markdown output.

        Args:
            project_name: Name shown in the top-level heading.
            tree_text: Pre-rendered directory tree, placed in a text fence.
            files_data: One dict per file with at least 'path' and 'content'.
                'content' may be a list of NotebookCellIR, a list of TableIR,
                or anything else (rendered via ``str``).
            stats: Aggregate counters (not used by this generator).
            total_tokens: Estimated token count shown in the header.
            token_method: Token-counting method identifier; unknown values
                are reported as "word_count".
            config: Optional configuration; when given, its table limits are
                enforced on rendered tables.

        Returns:
            The complete Markdown document.
        """
        timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
        known_methods = ("o200k_base", "regex_fallback")
        method_label = token_method if token_method in known_methods else "word_count"

        lines = [
            f"<!-- {GENERATION_FLAG} -->",
            "",
            f"# codebase: {project_name}",
            "",
            SYSTEM_INSTRUCTIONS_MARKDOWN,
            "",
            f"> Generated on: {timestamp}",
            f"> Tokens: {total_tokens} (est. via {method_label})",
            "",
            "# Directory Structure",
            "```text",
            tree_text,
            "```",
            "",
            "# Files",
            "",
            "This section contains the contents of the repository's files.",
            ""
        ]

        for file_info in files_data:
            rel_path = file_info['path']
            # Normalize path to match directory structure (always use backslashes)
            display_path = rel_path.replace(os.sep, '\\')
            content = file_info['content']
            ext = Path(rel_path).suffix.lower()

            lines.append(f"## File: {display_path}")

            if isinstance(content, list) and content and isinstance(content[0], NotebookCellIR):
                # Render Notebook IR
                for cell in content:
                    lines.append(f"### Cell {cell.number} ({cell.type}) - {display_path}")
                    wrapper = get_dynamic_wrapper(cell.source)
                    lang = 'python' if cell.type == 'code' else 'markdown'
                    lines.append(f"{wrapper}{lang}")
                    lines.append(cell.source)
                    lines.append(wrapper)

                    if cell.outputs:
                        # Outputs can contain backticks too (e.g. printed
                        # Markdown), so size the fence dynamically like every
                        # other fenced body instead of a fixed ``` fence.
                        output_wrapper = get_dynamic_wrapper(cell.outputs)
                        lines.append("\n**Outputs:**")
                        lines.append(f"{output_wrapper}text")
                        lines.append(cell.outputs)
                        lines.append(output_wrapper)
                    lines.append("")

            elif isinstance(content, list) and content and isinstance(content[0], TableIR):
                # Render Table IR (CSV/Excel)
                for table in content:
                    table_parts = []

                    # Handle Excel Sheet Metadata
                    if table.sheet_number is not None:
                        lines.append(f"### Sheet {table.sheet_number}: {table.name} - {table.file_path}")

                    if table.header_note:
                        table_parts.append(table.header_note)

                    if not table.df.empty:
                        table_parts.append(table.df.to_markdown(index=False))

                    if table.footer_note:
                        table_parts.append(table.footer_note)

                    table_text = "\n".join(table_parts)
                    if config:
                        table_text = enforce_table_limit(table_text, config.table_limit, config.table_truncate)

                    lines.append(table_text)

                    # Close Sheet block if applicable
                    if table.sheet_number is not None:
                        lines.append("---")

                    lines.append("")

            else:
                # Standard files or fallback string content
                str_content = str(content)
                wrapper = get_dynamic_wrapper(str_content)
                # Fence language: 'markdown' for .md, the bare extension for
                # other suffixed files, 'text' when there is no suffix.
                if ext == '.md':
                    lang = 'markdown'
                elif ext:
                    lang = ext[1:]
                else:
                    lang = 'text'
                lines.append(f"{wrapper}{lang}")
                lines.append(str_content)
                lines.append(wrapper)

            lines.append("")

        return "\n".join(lines)
|
|
137
|
+
|
|
138
|
+
class XMLGenerator(OutputGenerator):
    """Renders the collected project data as one XML-tagged document."""

    def generate(self,
                 project_name: str,
                 tree_text: str,
                 files_data: List[Dict[str, Any]],
                 stats: Dict[str, Any],
                 total_tokens: int,
                 token_method: str,
                 config: 'Config' = None) -> str:
        """Build the XML output.

        Attribute values derived from file data (paths, sheet names, cell
        metadata) are passed through ``quoteattr`` so quotes, '&' or '<' in
        them cannot break the markup, matching how ``project_name`` is
        handled.

        Args:
            project_name: Value of the root element's 'name' attribute.
            tree_text: Pre-rendered directory tree (escaped before insertion).
            files_data: One dict per file with at least 'path' and 'content'.
                'content' may be a list of NotebookCellIR, a list of TableIR,
                or anything else (rendered via ``str``).
            stats: Aggregate counters (not used by this generator).
            total_tokens: Estimated token count written into the metadata.
            token_method: Token-counting method identifier; unknown values
                are reported as "word_count".
            config: Optional configuration; when given, its table limits are
                enforced on rendered tables.

        Returns:
            The complete XML document.
        """
        timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
        known_methods = ("o200k_base", "regex_fallback")
        method_label = token_method if token_method in known_methods else "word_count"

        lines = [
            f"<!-- {GENERATION_FLAG} -->",
            "",
            f'<codebase name={quoteattr(project_name)}>',
            "",
            SYSTEM_INSTRUCTIONS_XML,
            "",
            "<metadata>",
            f" <generated_on>{timestamp}</generated_on>",
            f' <total_tokens method="{method_label}">{total_tokens}</total_tokens>',
            "</metadata>",
            "",
            f"<{TAG_DIRECTORY_STRUCTURE}>",
            escape(tree_text),
            f"</{TAG_DIRECTORY_STRUCTURE}>",
            "",
            f"<{TAG_FILES}>",
            "This section contains the contents of the repository's files.",
            ""
        ]

        for file_info in files_data:
            rel_path = file_info['path']
            # Normalize path to match directory structure (always use backslashes)
            display_path = rel_path.replace(os.sep, '\\')
            # quoteattr returns the value already wrapped in quotes.
            path_attr = quoteattr(display_path)
            content = file_info['content']

            lines.append(f'<{TAG_FILE} path={path_attr}>')

            if isinstance(content, list) and content and isinstance(content[0], NotebookCellIR):
                # Render Notebook IR to XML
                for cell in content:
                    index_attr = quoteattr(str(cell.number))
                    type_attr = quoteattr(str(cell.type))
                    lines.append(f' <cell path={path_attr} index={index_attr} type={type_attr}>')
                    lines.append(f' <{TAG_CONTENT}>')
                    lines.append(escape(cell.source))
                    lines.append(f' </{TAG_CONTENT}>')
                    if cell.outputs:
                        lines.append(' <outputs>')
                        lines.append(escape(cell.outputs))
                        lines.append(' </outputs>')
                    lines.append(' </cell>')

            elif isinstance(content, list) and content and isinstance(content[0], TableIR):
                # Render Table IR to XML
                for table in content:
                    # Handle Excel Sheet Metadata
                    if table.sheet_number is not None:
                        name_attr = quoteattr(str(table.name))
                        number_attr = quoteattr(str(table.sheet_number))
                        sheet_path_attr = quoteattr(str(table.file_path))
                        lines.append(f'<sheet name={name_attr} sheet_number={number_attr} path={sheet_path_attr}>')

                    table_parts = []
                    if table.header_note:
                        table_parts.append(table.header_note)

                    if not table.df.empty:
                        table_parts.append(table.df.to_markdown(index=False))

                    if table.footer_note:
                        table_parts.append(table.footer_note)

                    table_text = "\n".join(table_parts)
                    if config:
                        table_text = enforce_table_limit(table_text, config.table_limit, config.table_truncate)

                    lines.append(escape(table_text))

                    # Close Sheet block if applicable
                    if table.sheet_number is not None:
                        lines.append('</sheet>')

            else:
                # Standard files or fallback string content
                # NOTE(review): unlike notebook cells and tables, this content
                # is inserted without escape(); a file containing '</file>' or
                # '&' yields non-well-formed XML. Left as-is because raw code
                # may be intentional for readability - confirm.
                lines.append(str(content))

            lines.append(f"</{TAG_FILE}>")
            lines.append("")

        lines.append(f"</{TAG_FILES}>")
        lines.append("</codebase>")

        return "\n".join(lines)
|
|
231
|
+
|
|
232
|
+
def get_generator(format_type: str) -> OutputGenerator:
    """Return a generator instance for ``format_type``.

    The comparison is case-insensitive. 'markdown' selects the Markdown
    generator; every other value selects the XML generator.
    """
    wants_markdown = format_type.lower() == 'markdown'
    generator_cls = MarkdownGenerator if wants_markdown else XMLGenerator
    return generator_cls()
|