code-to-txt 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,180 +1,329 @@
1
1
  import os
2
+ from fnmatch import fnmatch
2
3
  from pathlib import Path
3
- from typing import List, Optional, Set
4
+ from typing import Any
5
+
4
6
  import pathspec
7
+ from pathspec import PathSpec
5
8
 
9
+ from .utils import load_patterns_from_file
6
10
 
7
- class CodeToText:
8
- DEFAULT_IGNORE = {
9
- "__pycache__",
10
- "*.pyc",
11
- "*.pyo",
12
- "*.pyd"
13
- }
14
11
 
15
- DEFAULT_EXTENSIONS = {
16
- ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".c", ".cpp", ".h"
17
- }
12
+ class CodeToText:
13
+ """Convert code files to a single text file for LLM consumption."""
18
14
 
19
15
  def __init__(
20
16
  self,
21
17
  root_path: str,
22
- output_file: str = "output.txt",
23
- include_extensions: Optional[Set[str]] = None,
24
- exclude_patterns: Optional[List[str]] = None,
18
+ output_file: str | None = "output.txt",
19
+ include_extensions: set[str] | None = None,
20
+ exclude_patterns: list[str] | None = None,
21
+ glob_patterns: list[str] | None = None,
25
22
  gitignore: bool = True,
23
+ max_file_size_kb: int | None = None,
26
24
  ):
27
25
  """
28
- Initialize the instance of CodeToText.
26
+ Initialize CodeToText instance.
29
27
 
30
28
  Args:
31
29
  root_path: Root directory to scan
32
- output_file: Output file path
30
+ output_file: Output file path (None for clipboard-only mode)
33
31
  include_extensions: Set of file extensions to include (with dots)
34
32
  exclude_patterns: List of patterns to exclude (gitignore style)
33
+ glob_patterns: List of glob patterns to include (e.g., '*.py', 'src/**/*.js')
35
34
  gitignore: Whether to respect .gitignore files
35
+ max_file_size_kb: Skip files larger than this size in KB
36
36
  """
37
37
  self.root_path = Path(root_path).resolve()
38
38
  self.output_file = output_file
39
- self.include_extensions = include_extensions or self.DEFAULT_EXTENSIONS
40
- self.exclude_patterns = exclude_patterns or []
39
+ self.glob_patterns = glob_patterns or []
41
40
  self.gitignore = gitignore
42
- self.spec = None
41
+ self.max_file_size_kb = max_file_size_kb
42
+ self.spec: PathSpec | None = None
43
+ self.file_count = 0
44
+ self.skipped_files: list[tuple[Path, str]] = []
45
+
46
+ config_dir = Path(__file__).parent
47
+ default_extensions = load_patterns_from_file(config_dir / ".extensions")
48
+ default_ignore = load_patterns_from_file(config_dir / ".ignore")
49
+
50
+ self.include_extensions = include_extensions or default_extensions
51
+ self.exclude_patterns = exclude_patterns or []
52
+ self.default_ignore = default_ignore
43
53
 
44
54
  if self.gitignore:
45
- self._load_gitignore()
55
+ self._init_pathspec()
46
56
 
47
- def _load_gitignore(self):
48
- """Load .gitignore patterns if present."""
49
- gitignore_path = self.root_path / ".gitignore"
50
- patterns = list(self.DEFAULT_IGNORE)
57
+ def _init_pathspec(self) -> None:
58
+ """Initialize pathspec from .gitignore files and default patterns."""
59
+ patterns = list(self.default_ignore)
60
+ current_path = self.root_path
51
61
 
52
- if gitignore_path.exists():
53
- with open(gitignore_path, "r") as f:
54
- for line in f:
55
- line = line.strip()
56
- if line and not line.startswith("#"):
57
- patterns.append(line)
62
+ for _ in range(5):
63
+ gitignore_path = current_path / ".gitignore"
64
+ if gitignore_path.exists():
65
+ try:
66
+ with open(gitignore_path, encoding="utf-8") as f:
67
+ for line in f:
68
+ clean_line = line.strip()
69
+ if clean_line and not clean_line.startswith("#"):
70
+ patterns.append(clean_line)
71
+ except Exception:
72
+ pass
73
+
74
+ parent = current_path.parent
75
+ if parent == current_path:
76
+ break
77
+ current_path = parent
58
78
 
59
79
  patterns.extend(self.exclude_patterns)
60
80
  self.spec = pathspec.PathSpec.from_lines("gitignore", patterns)
61
81
 
62
- def _should_include_file(self, file_path: Path) -> bool:
63
- """Check if a file should be included."""
64
- # Check extension
65
- if file_path.suffix not in self.include_extensions:
82
+ def _check_glob_match(self, file_path: Path) -> bool:
83
+ """Check if file matches any glob pattern."""
84
+ if not self.glob_patterns:
66
85
  return False
67
86
 
87
+ relative_path = file_path.relative_to(self.root_path)
88
+ path_str = str(relative_path)
89
+
90
+ for pattern in self.glob_patterns:
91
+ if fnmatch(path_str, pattern):
92
+ return True
93
+ if fnmatch(file_path.name, pattern):
94
+ return True
95
+ if fnmatch(path_str.replace(os.sep, "/"), pattern):
96
+ return True
97
+
98
+ return False
99
+
100
+ def _check_file_inclusion(self, file_path: Path) -> bool:
101
+ """Determine if a file should be included in the output."""
102
+ if self.max_file_size_kb is not None:
103
+ try:
104
+ file_size_kb = file_path.stat().st_size / 1024
105
+ if file_size_kb > self.max_file_size_kb:
106
+ self.skipped_files.append(
107
+ (file_path, f"exceeds size limit ({file_size_kb:.1f}KB)")
108
+ )
109
+ return False
110
+ except Exception:
111
+ pass
112
+
113
+ if self.glob_patterns:
114
+ if not self._check_glob_match(file_path):
115
+ return False
116
+
68
117
  if self.spec:
69
- relative_path = file_path.relative_to(self.root_path)
70
- if self.spec.match_file(str(relative_path)):
118
+ try:
119
+ relative_path = file_path.relative_to(self.root_path)
120
+ relative_str = str(relative_path).replace(os.sep, "/")
121
+
122
+ if self.spec.match_file(relative_str):
123
+ self.skipped_files.append((file_path, "matches ignore pattern"))
124
+ return False
125
+ except ValueError:
71
126
  return False
72
127
 
128
+ if file_path.suffix not in self.include_extensions:
129
+ return False
130
+
73
131
  return True
74
132
 
75
- def _get_files(self) -> List[Path]:
76
- """Get all files to process."""
133
+ def _collect_files(self) -> list[Path]:
134
+ """Collect all files to process based on filters."""
77
135
  files = []
136
+ self.skipped_files = []
137
+
78
138
  for root, dirs, filenames in os.walk(self.root_path):
79
139
  root_path = Path(root)
80
140
 
81
141
  if self.spec:
82
- relative_root = root_path.relative_to(self.root_path)
83
- dirs[:] = [
84
- d for d in dirs
85
- if not self.spec.match_file(str(relative_root / d))
86
- ]
142
+ try:
143
+ relative_root = root_path.relative_to(self.root_path)
144
+ root_str = str(relative_root).replace(os.sep, "/") if str(relative_root) != "." else ""
145
+
146
+ filtered_dirs = []
147
+ for d in dirs:
148
+ dir_path = f"{root_str}/{d}" if root_str else d
149
+
150
+ if not self.spec.match_file(dir_path) and not self.spec.match_file(f"{dir_path}/"):
151
+ filtered_dirs.append(d)
152
+
153
+ dirs[:] = filtered_dirs
154
+ except ValueError:
155
+ pass
87
156
 
88
157
  for filename in filenames:
89
158
  file_path = root_path / filename
90
- if self._should_include_file(file_path):
159
+ if self._check_file_inclusion(file_path):
91
160
  files.append(file_path)
92
161
 
93
162
  return sorted(files)
94
163
 
95
- def convert(self, add_tree: bool = True, separator: str = "=" * 80) -> int:
164
+ def calculate_statistics(self) -> dict[str, Any]:
96
165
  """
97
- Convert files to single text file.
166
+ Calculate statistics about the codebase.
167
+
168
+ Returns:
169
+ Dictionary containing total files, size, lines, breakdown by extension, etc.
170
+ """
171
+ files = self._collect_files()
172
+
173
+ stats: dict = {
174
+ "total_files": len(files),
175
+ "total_size_bytes": 0,
176
+ "total_lines": 0,
177
+ "by_extension": {},
178
+ "skipped_files": len(self.skipped_files),
179
+ "largest_files": [],
180
+ }
181
+
182
+ file_sizes = []
183
+
184
+ for file_path in files:
185
+ try:
186
+ size = file_path.stat().st_size
187
+ stats["total_size_bytes"] += size
188
+ file_sizes.append((file_path, size))
189
+
190
+ try:
191
+ with open(file_path, encoding="utf-8") as f:
192
+ lines = sum(1 for _ in f)
193
+ stats["total_lines"] += lines
194
+ except Exception:
195
+ pass
196
+
197
+ ext = file_path.suffix or "(no extension)"
198
+ if ext not in stats["by_extension"]:
199
+ stats["by_extension"][ext] = {"count": 0, "size": 0}
200
+ stats["by_extension"][ext]["count"] += 1
201
+ stats["by_extension"][ext]["size"] += size
202
+
203
+ except Exception:
204
+ pass
205
+
206
+ file_sizes.sort(key=lambda x: x[1], reverse=True)
207
+ stats["largest_files"] = [
208
+ {"path": str(f.relative_to(self.root_path)), "size_kb": s / 1024}
209
+ for f, s in file_sizes[:10]
210
+ ]
211
+
212
+ return stats
213
+
214
+ def generate_content(self, add_tree: bool = True, separator: str = "=" * 80) -> str:
215
+ """
216
+ Generate content as string without writing to file.
98
217
 
99
218
  Args:
100
219
  add_tree: Whether to add directory tree at the beginning
101
220
  separator: Separator between files
102
221
 
103
222
  Returns:
104
- Number of files processed
223
+ Generated content as string
105
224
  """
106
- files = self._get_files()
225
+ files = self._collect_files()
226
+ self.file_count = len(files)
227
+
228
+ output_lines = []
229
+ output_lines.append(f"Code Export from: {self.root_path}")
230
+ output_lines.append(f"Total files: {len(files)}")
231
+ output_lines.append(separator)
232
+ output_lines.append("")
233
+
234
+ if add_tree:
235
+ output_lines.append("DIRECTORY TREE:")
236
+ output_lines.append(separator)
237
+ output_lines.append(self._build_tree_structure())
238
+ output_lines.append("")
239
+ output_lines.append(separator)
240
+ output_lines.append("")
241
+
242
+ for idx, file_path in enumerate(files, 1):
243
+ relative_path = file_path.relative_to(self.root_path)
107
244
 
108
- with open(self.output_file, "w", encoding="utf-8") as out:
109
- out.write(f"Code Export from: {self.root_path}\n")
110
- out.write(f"Total files: {len(files)}\n")
111
- out.write(f"{separator}\n\n")
245
+ output_lines.append(f"FILE {idx}/{len(files)}: {relative_path}")
246
+ output_lines.append(separator)
112
247
 
113
- if add_tree:
114
- out.write("DIRECTORY TREE:\n")
115
- out.write(separator + "\n")
116
- out.write(self._generate_tree())
117
- out.write(f"\n{separator}\n\n")
248
+ try:
249
+ with open(file_path, encoding="utf-8") as f:
250
+ content = f.read()
251
+ output_lines.append(content)
252
+ except UnicodeDecodeError:
253
+ output_lines.append("[Binary file - skipped]")
254
+ except Exception as e:
255
+ output_lines.append(f"[Error reading file: {e}]")
118
256
 
119
- for i, file_path in enumerate(files, 1):
120
- relative_path = file_path.relative_to(self.root_path)
257
+ output_lines.append("")
258
+ output_lines.append(separator)
259
+ output_lines.append("")
121
260
 
122
- out.write(f"FILE {i}/{len(files)}: {relative_path}\n")
123
- out.write(separator + "\n")
261
+ return "\n".join(output_lines)
124
262
 
125
- try:
126
- with open(file_path, "r", encoding="utf-8") as f:
127
- content = f.read()
128
- out.write(content)
129
- except UnicodeDecodeError:
130
- out.write(f"[Binary file - skipped]\n")
131
- except Exception as e:
132
- out.write(f"[Error reading file: {e}]\n")
263
+ def convert(self, add_tree: bool = True, separator: str = "=" * 80) -> int:
264
+ """
265
+ Convert files to single text file.
266
+
267
+ Args:
268
+ add_tree: Whether to add directory tree at the beginning
269
+ separator: Separator between files
270
+
271
+ Returns:
272
+ Number of files processed
273
+ """
274
+ if not self.output_file:
275
+ raise ValueError("output_file must be specified for convert()")
276
+
277
+ content = self.generate_content(add_tree=add_tree, separator=separator)
278
+
279
+ with open(self.output_file, "w", encoding="utf-8") as out:
280
+ out.write(content)
133
281
 
134
- out.write(f"\n{separator}\n\n")
282
+ return self.file_count
135
283
 
136
- return len(files)
284
+ def _build_tree_structure(self) -> str:
285
+ """Build a directory tree representation of included files."""
286
+ tree_output = []
287
+ files = self._collect_files()
137
288
 
138
- def _generate_tree(self) -> str:
139
- """Generate a directory tree representation."""
140
- tree_lines = []
141
- files = self._get_files()
289
+ if not files:
290
+ return "(no files to display)"
142
291
 
143
- dir_structure = {}
292
+ structure: dict[str, Any] = {}
144
293
  for file_path in files:
145
294
  relative_path = file_path.relative_to(self.root_path)
146
295
  parts = relative_path.parts
147
296
 
148
- current = dir_structure
297
+ current_level = structure
149
298
  for part in parts[:-1]:
150
- if part not in current:
151
- current[part] = {}
152
- current = current[part]
299
+ if part not in current_level:
300
+ current_level[part] = {}
301
+ current_level = current_level[part]
153
302
 
154
- if "__files__" not in current:
155
- current["__files__"] = []
156
- current["__files__"].append(parts[-1])
303
+ if "__files__" not in current_level:
304
+ current_level["__files__"] = []
305
+ current_level["__files__"].append(parts[-1])
157
306
 
158
- def print_tree(structure, prefix="", is_last=True):
159
- items = []
160
- for key in sorted(structure.keys()):
307
+ def render_tree(node: dict[str, Any], prefix: str = "", is_final: bool = True) -> None:
308
+ entries = []
309
+ for key in sorted(node.keys()):
161
310
  if key != "__files__":
162
- items.append((key, True)) # directory
311
+ entries.append((key, True))
163
312
 
164
- if "__files__" in structure:
165
- for file in sorted(structure["__files__"]):
166
- items.append((file, False)) # file
313
+ if "__files__" in node:
314
+ for file in sorted(node["__files__"]):
315
+ entries.append((file, False))
167
316
 
168
- for i, (name, is_dir) in enumerate(items):
169
- is_last_item = i == len(items) - 1
170
- connector = "└── " if is_last_item else "├── "
171
- tree_lines.append(f"{prefix}{connector}{name}{'/' if is_dir else ''}")
317
+ for i, (name, is_directory) in enumerate(entries):
318
+ is_last_entry = i == len(entries) - 1
319
+ connector = "└── " if is_last_entry else "├── "
320
+ tree_output.append(f"{prefix}{connector}{name}{'/' if is_directory else ''}")
172
321
 
173
- if is_dir:
174
- extension = " " if is_last_item else "│ "
175
- print_tree(structure[name], prefix + extension, is_last_item)
322
+ if is_directory:
323
+ extension = " " if is_last_entry else "│ "
324
+ render_tree(node[name], prefix + extension, is_last_entry)
176
325
 
177
- tree_lines.append(f"{self.root_path.name}/")
178
- print_tree(dir_structure)
326
+ tree_output.append(f"{self.root_path.name}/")
327
+ render_tree(structure)
179
328
 
180
- return "\n".join(tree_lines)
329
+ return "\n".join(tree_output)
code_to_txt/config.py ADDED
@@ -0,0 +1,167 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ import yaml
5
+
6
+ DEFAULT_CONFIG = {
7
+ "output": "code-to-txt.txt",
8
+ "extensions": None,
9
+ "exclude": [
10
+ "tests/*",
11
+ "*.test.js",
12
+ "*.test.ts",
13
+ "*.spec.js",
14
+ "*.spec.ts",
15
+ ],
16
+ "glob": [],
17
+ "no_gitignore": False,
18
+ "no_tree": False,
19
+ "separator": "=" * 80,
20
+ "clipboard": False,
21
+ "clipboard_only": False,
22
+ "timestamp": True,
23
+ "max_file_size": None,
24
+ }
25
+
26
+
def load_config(config_path: str) -> dict[str, Any]:
    """
    Load configuration from YAML file.

    Args:
        config_path: Path to the configuration file

    Returns:
        Dictionary containing only the recognized, type-coerced settings.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    path = Path(config_path)

    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(path, encoding="utf-8") as fh:
        raw = yaml.safe_load(fh)

    if raw is None:
        raw = {}  # empty YAML file parses to None

    validated: dict[str, Any] = {}

    # Plain string settings.
    for key in ("output", "separator"):
        if key in raw:
            validated[key] = str(raw[key])

    # "extensions" normalizes to one space-separated string.
    if "extensions" in raw:
        ext = raw["extensions"]
        if isinstance(ext, str):
            validated["extensions"] = ext
        elif isinstance(ext, list):
            validated["extensions"] = " ".join(str(e) for e in ext)
        elif ext is not None:
            validated["extensions"] = str(ext)

    # List-valued settings accept a bare string as a one-element list.
    for key in ("exclude", "glob"):
        if key in raw:
            value = raw[key]
            if isinstance(value, list):
                validated[key] = value
            elif isinstance(value, str):
                validated[key] = [value]
            elif value is not None:
                validated[key] = [str(value)]

    # Boolean flags.
    for key in ("no_gitignore", "no_tree", "clipboard", "clipboard_only", "timestamp"):
        if key in raw:
            validated[key] = bool(raw[key])

    if "max_file_size" in raw and raw["max_file_size"] is not None:
        validated["max_file_size"] = int(raw["max_file_size"])

    return validated
def create_default_config(config_path: Path) -> None:
    """
    Create a default configuration file.

    Args:
        config_path: Path where to create the config file
    """
    config_content = """# Code-to-Txt Configuration File
# This file defines default settings for code-to-txt
# CLI arguments will override these settings

# Output file name (supports strftime formatting)
# Use timestamp: true to automatically add timestamp
output: code-to-txt.txt

# File extensions to include
# Can be a list or space/comma-separated string
# Leave as null to use default extensions
# extensions: [.py, .js, .ts]
# extensions: ".py .js .ts"
extensions: null

# Patterns to exclude (gitignore-style)
# These are in addition to .gitignore patterns
exclude:
- "tests/*"
- "*.test.js"
- "*.test.ts"
- "*.spec.js"
- "*.spec.ts"
- "node_modules/*"
- "__pycache__/*"
- "*.pyc"

# Glob patterns to include (alternative to extensions)
# If specified, only files matching these patterns will be included
# glob:
# - "*.py"
# - "src/**/*.js"
# - "**/*.tsx"
glob: []

# Ignore .gitignore files
no_gitignore: false

# Don't include directory tree in output
no_tree: false

# Separator between files
separator: "================================================================================"

# Copy output to clipboard
clipboard: false

# Copy to clipboard only (don't save file)
clipboard_only: false

# Add timestamp to output filename
timestamp: true

max_file_size: null

# Example configurations:
#
# For Python projects:
# extensions: [.py]
# exclude: ["tests/*", "*.pyc", "__pycache__/*", "venv/*"]
#
# For JavaScript/TypeScript projects:
# extensions: [.js, .ts, .jsx, .tsx]
# exclude: ["node_modules/*", "dist/*", "build/*", "*.test.js"]
#
# For C/C++ projects:
# extensions: [.c, .cpp, .h, .hpp]
# exclude: ["build/*", "*.o", "*.a"]
#
# Using glob patterns:
# glob: ["src/**/*.py", "lib/**/*.py", "*.md"]
# extensions: null
"""

    with open(config_path, "w", encoding="utf-8") as fh:
        fh.write(config_content)
code_to_txt/utils.py ADDED
@@ -0,0 +1,13 @@
1
+ from pathlib import Path
2
+
3
+
def load_patterns_from_file(file_path: Path) -> set[str]:
    """Load patterns from a text file, one per line.

    Blank lines and lines starting with '#' are ignored. Returns an empty
    set when the file does not exist.
    """
    if not file_path.exists():
        return set()

    with open(file_path, encoding="utf-8") as fh:
        return {
            stripped
            for line in fh
            if (stripped := line.strip()) and not stripped.startswith("#")
        }