merger-cli 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Diogo Losacco Toporcov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+
5
+ recursive-include merger *.py
6
+ recursive-include examples *.py
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.4
2
+ Name: merger-cli
3
+ Version: 1.0.0
4
+ Summary: Merger is a tool that scans a directory, filters files using customizable patterns, and merges readable content into a single output file.
5
+ Author-email: Diogo Toporcov <diogotoporcov@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/diogotoporcov/merger-cli
8
+ Project-URL: Documentation, https://github.com/diogotoporcov/merger-cli
9
+ Keywords: merger,file system,concatenation,automation,development
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: chardet>=5.2.0
19
+ Dynamic: license-file
20
+
21
+ # Merger CLI
22
+
23
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
24
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
25
+
26
+ Merger is a command-line utility for developers that scans a directory, filters files using customizable ignore patterns, and merges all readable content into a single structured output file. It supports custom file readers and validators, making it easily extendable for formats such as `.ipynb`, `.pdf`, or any specific format.
27
+
28
+ ---
29
+
30
+ ## Summary
31
+
32
+ 1. [Core Features](#core-features)
33
+ 2. [Dependencies](#dependencies)
34
+ 3. [Installation](#installation)
35
+ 4. [Usage](#usage)
36
+ 5. [Custom Readers](#custom-readers)
37
+ 6. [CLI Options](#cli-options)
38
+ 7. [License](#license)
39
+
40
+ ---
41
+
42
+ ## Core Features
43
+
44
+ * **Recursive merge** of all readable text files under a root directory.
45
+ * **Glob-based ignore patterns** using `.gitignore`-style syntax.
46
+ * **Automatic encoding detection**.
47
+ * **Custom file readers and validators** for non-text formats.
48
+ * **CLI support** for installation, removal, and listing of custom readers.
49
+ * **Human-readable merged output**, including a directory tree header and file delimiters.
50
+
51
+ ---
52
+
53
+ ## Dependencies
54
+
55
+ | Component | Version / Type | Notes |
56
+ |-------------|----------------|-----------------------------|
57
+ | **Python** | ≥ 3.8 | Required |
58
+
59
+ All dependencies are listed in [`requirements.txt`](requirements.txt).
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ ### 1. Clone the repository
66
+
67
+ ```bash
68
+ git clone https://github.com/diogotoporcov/merger-cli.git
69
+ cd merger-cli
70
+ ```
71
+
72
+ ### 2. Create and activate a virtual environment
73
+
74
+ **Linux / macOS**
75
+
76
+ ```bash
77
+ python -m venv .venv
78
+ source .venv/bin/activate
79
+ ```
80
+
81
+ **Windows (PowerShell)**
82
+
83
+ ```powershell
84
+ python -m venv .venv
85
+ .venv\Scripts\Activate.ps1
86
+ ```
87
+
88
+ ### 3. Install dependencies
89
+
90
+ ```bash
91
+ pip install -r requirements.txt
92
+ ```
93
+
94
+ ### 4. Install as CLI tool
95
+
96
+ ```bash
97
+ pip install .
98
+ ```
99
+
100
+ This registers the `merger` command globally.
101
+
102
+ ---
103
+
104
+ ## Usage
105
+
106
+ ### Basic merge
107
+
108
+ ```bash
109
+ merger ./src ./merged.txt
110
+ ```
111
+
112
+ ### Custom ignore patterns
113
+
114
+ ```bash
115
+ merger "C:\Users\USER\Desktop\project" "C:\Users\USER\Desktop\project\output.txt" --ignore "*.log" "__pycache__" "*.tmp"
116
+ ```
117
+
118
+ ### Custom ignore file
119
+
120
+ ```bash
121
+ merger . ./output.txt -f ./merger.ignore
122
+ ```
123
+
124
+ ### Include empty files
125
+
126
+ ```bash
127
+ merger ./data ./output.txt --empty
128
+ ```
129
+
130
+ ### Verbose output
131
+
132
+ ```bash
133
+ merger ./src ./merged.txt --log-level DEBUG
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Custom Readers
139
+
140
+ You can extend Merger to handle new file formats.
141
+
142
+ ### Installing a custom reader
143
+
144
+ ```bash
145
+ merger --install .ipynb path/to/ipynb.py
146
+ ```
147
+
148
+ Where `ipynb.py` must define:
149
+
150
+ * ```python
151
+ validator: Callable[[Path], bool]
152
+ ```
153
+ * ```python
154
+ reader: Callable[[Path], str]
155
+ ```
156
+
157
+ To uninstall:
158
+
159
+ ```bash
160
+ merger --uninstall .ipynb
161
+ ```
162
+
163
+ List installed readers:
164
+
165
+ ```bash
166
+ merger --list-installed
167
+ ```
168
+
169
+ An example `.ipynb` reader can be found in
170
+ [`examples/custom_readers/ipynb.py`](examples/custom_readers/ipynb.py).
171
+
172
+ ---
173
+
174
+ ## CLI Options
175
+
176
+ | Option | Description |
177
+ |-------------------------|--------------------------------------------------------------------------------|
178
+ | `--ignore` | List of glob-style ignore patterns. |
179
+ | `-f, --ignore-file` | Path to file containing ignore patterns (Default: `<input_dir>/merger.ignore`). |
180
+ | `-i, --install` | Install a custom reader for an extension. |
181
+ | `-u, --uninstall` | Remove a custom reader (`*` removes all). |
182
+ | `--list-installed` | Show installed readers. |
183
+ | `-l, --log-level` | Set logging verbosity (`DEBUG`, `INFO`, etc.). |
184
+ | `--empty` | Include empty files in merged output. |
185
+ | `--prefix` / `--suffix` | Customize file delimiters in output. |
186
+ | `--overrides` | Load override reader definitions from a Python module. |
187
+ | `--no-tree` | Do not include the generated directory tree in the output file. |
188
+
189
+
190
+ ---
191
+
192
+ ## License
193
+
194
+ This project is licensed under the MIT License — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,174 @@
1
+ # Merger CLI
2
+
3
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
5
+
6
+ Merger is a command-line utility for developers that scans a directory, filters files using customizable ignore patterns, and merges all readable content into a single structured output file. It supports custom file readers and validators, making it easily extendable for formats such as `.ipynb`, `.pdf`, or any specific format.
7
+
8
+ ---
9
+
10
+ ## Summary
11
+
12
+ 1. [Core Features](#core-features)
13
+ 2. [Dependencies](#dependencies)
14
+ 3. [Installation](#installation)
15
+ 4. [Usage](#usage)
16
+ 5. [Custom Readers](#custom-readers)
17
+ 6. [CLI Options](#cli-options)
18
+ 7. [License](#license)
19
+
20
+ ---
21
+
22
+ ## Core Features
23
+
24
+ * **Recursive merge** of all readable text files under a root directory.
25
+ * **Glob-based ignore patterns** using `.gitignore`-style syntax.
26
+ * **Automatic encoding detection**.
27
+ * **Custom file readers and validators** for non-text formats.
28
+ * **CLI support** for installation, removal, and listing of custom readers.
29
+ * **Human-readable merged output**, including a directory tree header and file delimiters.
30
+
31
+ ---
32
+
33
+ ## Dependencies
34
+
35
+ | Component | Version / Type | Notes |
36
+ |-------------|----------------|-----------------------------|
37
+ | **Python** | ≥ 3.8 | Required |
38
+
39
+ All dependencies are listed in [`requirements.txt`](requirements.txt).
40
+
41
+ ---
42
+
43
+ ## Installation
44
+
45
+ ### 1. Clone the repository
46
+
47
+ ```bash
48
+ git clone https://github.com/diogotoporcov/merger-cli.git
49
+ cd merger-cli
50
+ ```
51
+
52
+ ### 2. Create and activate a virtual environment
53
+
54
+ **Linux / macOS**
55
+
56
+ ```bash
57
+ python -m venv .venv
58
+ source .venv/bin/activate
59
+ ```
60
+
61
+ **Windows (PowerShell)**
62
+
63
+ ```powershell
64
+ python -m venv .venv
65
+ .venv\Scripts\Activate.ps1
66
+ ```
67
+
68
+ ### 3. Install dependencies
69
+
70
+ ```bash
71
+ pip install -r requirements.txt
72
+ ```
73
+
74
+ ### 4. Install as CLI tool
75
+
76
+ ```bash
77
+ pip install .
78
+ ```
79
+
80
+ This registers the `merger` command globally.
81
+
82
+ ---
83
+
84
+ ## Usage
85
+
86
+ ### Basic merge
87
+
88
+ ```bash
89
+ merger ./src ./merged.txt
90
+ ```
91
+
92
+ ### Custom ignore patterns
93
+
94
+ ```bash
95
+ merger "C:\Users\USER\Desktop\project" "C:\Users\USER\Desktop\project\output.txt" --ignore "*.log" "__pycache__" "*.tmp"
96
+ ```
97
+
98
+ ### Custom ignore file
99
+
100
+ ```bash
101
+ merger . ./output.txt -f ./merger.ignore
102
+ ```
103
+
104
+ ### Include empty files
105
+
106
+ ```bash
107
+ merger ./data ./output.txt --empty
108
+ ```
109
+
110
+ ### Verbose output
111
+
112
+ ```bash
113
+ merger ./src ./merged.txt --log-level DEBUG
114
+ ```
115
+
116
+ ---
117
+
118
+ ## Custom Readers
119
+
120
+ You can extend Merger to handle new file formats.
121
+
122
+ ### Installing a custom reader
123
+
124
+ ```bash
125
+ merger --install .ipynb path/to/ipynb.py
126
+ ```
127
+
128
+ Where `ipynb.py` must define:
129
+
130
+ * ```python
131
+ validator: Callable[[Path], bool]
132
+ ```
133
+ * ```python
134
+ reader: Callable[[Path], str]
135
+ ```
136
+
137
+ To uninstall:
138
+
139
+ ```bash
140
+ merger --uninstall .ipynb
141
+ ```
142
+
143
+ List installed readers:
144
+
145
+ ```bash
146
+ merger --list-installed
147
+ ```
148
+
149
+ An example `.ipynb` reader can be found in
150
+ [`examples/custom_readers/ipynb.py`](examples/custom_readers/ipynb.py).
151
+
152
+ ---
153
+
154
+ ## CLI Options
155
+
156
+ | Option | Description |
157
+ |-------------------------|--------------------------------------------------------------------------------|
158
+ | `--ignore` | List of glob-style ignore patterns. |
159
+ | `-f, --ignore-file` | Path to file containing ignore patterns (Default: `<input_dir>/merger.ignore`). |
160
+ | `-i, --install` | Install a custom reader for an extension. |
161
+ | `-u, --uninstall` | Remove a custom reader (`*` removes all). |
162
+ | `--list-installed` | Show installed readers. |
163
+ | `-l, --log-level` | Set logging verbosity (`DEBUG`, `INFO`, etc.). |
164
+ | `--empty` | Include empty files in merged output. |
165
+ | `--prefix` / `--suffix` | Customize file delimiters in output. |
166
+ | `--overrides` | Load override reader definitions from a Python module. |
167
+ | `--no-tree` | Do not include the generated directory tree in the output file. |
168
+
169
+
170
+ ---
171
+
172
+ ## License
173
+
174
+ This project is licensed under the MIT License — see [LICENSE](LICENSE) for details.
File without changes
File without changes
@@ -0,0 +1,82 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Final, Callable
4
+
5
+ import chardet
6
+
7
+
8
def is_ipynb_file(file_path: Path) -> bool:
    """
    Decide whether a file looks like a Jupyter Notebook.

    The file must carry the ``.ipynb`` suffix and parse as a JSON object whose
    ``"cells"`` entry is a list. The text encoding is guessed from the first
    1024 bytes with ``chardet``; when no encoding is reported or the reported
    confidence is below 0.8, UTF-8 is used instead.

    Args:
        file_path (Path): Path to the candidate file.

    Returns:
        bool: True when the checks above succeed, False otherwise (including
        when any exception is raised while reading or parsing the file).
    """
    try:
        if file_path.suffix != ".ipynb":
            return False

        with open(file_path, "rb") as raw:
            head = raw.read(1024)

        detection = chardet.detect(head)
        chosen_encoding = detection.get("encoding")
        if not chosen_encoding or detection.get("confidence", 0) < 0.8:
            chosen_encoding = "utf-8"

        with file_path.open(encoding=chosen_encoding) as handle:
            data = json.load(handle)

        if not isinstance(data, dict):
            return False
        return "cells" in data and isinstance(data["cells"], list)

    except Exception:
        # Any failure (I/O, decoding, JSON parsing) means "not a notebook".
        return False
39
+
40
+
41
def extract_ipynb_content(file_path: Path, include_markdown: bool = True, include_code: bool = True) -> str:
    """
    Pull the code and markdown cell sources out of a Jupyter Notebook file.

    The notebook is read as UTF-8 JSON. Code cells are emitted verbatim;
    markdown cells are wrapped in a fenced ``markdown`` block. Trailing
    whitespace is stripped from every source line, and cells with an empty
    source are dropped. Cells of any other type are ignored.

    Args:
        file_path (Path): Path to the notebook file.
        include_markdown (bool): Whether markdown cells are emitted.
        include_code (bool): Whether code cells are emitted.

    Returns:
        str: The selected cells, joined by a blank line.
    """
    with file_path.open(encoding="utf-8") as handle:
        notebook = json.load(handle)

    rendered = []
    for cell in notebook.get("cells", []):
        kind = cell.get("cell_type")
        source = cell.get("source", [])

        # "source" may be stored as one string or as a list of lines.
        if isinstance(source, str):
            source = source.splitlines()

        wanted = (kind == "code" and include_code) or (kind == "markdown" and include_markdown)
        if not wanted:
            continue

        stripped = [line.rstrip() for line in source]
        if not stripped:
            continue

        body = "\n".join(stripped)
        if kind == "code":
            rendered.append(body)
        else:
            rendered.append("```markdown\n" + body + "\n```")

    return "\n\n".join(rendered)
76
+
77
+
78
+ validator: Final[Callable[[Path], bool]] = is_ipynb_file
79
+ reader: Final[Callable[[Path], str]] = extract_ipynb_content
80
+
81
+
82
+ __all__ = ["validator", "reader"]
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+
3
+ from merger.files import merge
4
+ import examples.custom_readers.ipynb as ipynb
5
+
6
if __name__ == "__main__":
    # Directory to merge and the file that receives the merged output.
    source_dir = Path("path/to/dir")
    destination = Path("./output.txt")

    # Entries to leave out of the merge.
    excluded = [
        "README.md",
        ".idea",
        "__pycache__",
        ".env",
        "./example/path",  # Intended: file or folder named 'path' under ./example/, where '.' is the root dir
        "C:/Users/User/Desktop/path/to/dir/2",
        "output.txt",
        ".venv",
        "*.docx",  # Any file with the extension .docx
        "*cache*",  # Any file or folder that contains 'cache' in its name or path
        "__*__"  # Any file or folder that starts with '__' and ends with '__'
    ]

    # Route .ipynb files through the example notebook validator and reader.
    notebook_validators = {".ipynb": ipynb.validator}
    notebook_readers = {".ipynb": ipynb.reader}

    merge(
        source_dir,
        excluded,
        destination,
        validation_func_override=notebook_validators,
        read_func_override=notebook_readers
    )
File without changes
@@ -0,0 +1,143 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ from .files import merge, read_ignore_file
6
+ from .logger import logger, setup_logger
7
+ from .registry import register_reader, unregister_reader, list_readers, load_installed_readers, load_custom_readers
8
+
9
+
10
def main():
    """
    Command-line entry point.

    Parses the arguments, configures logging, and then does exactly one of:
    install a reader, uninstall reader(s), list installed readers, or merge
    the files under ``input_dir`` into ``output_file``.
    """
    parser = argparse.ArgumentParser(
        description="Merge readable files in a directory with support for ignore patterns and custom file readers."
    )
    add = parser.add_argument

    # Positional arguments (optional at parse time so reader management works without them).
    add("input_dir", type=Path, nargs="?", help="Root directory to scan for files")
    add("output_file", type=Path, nargs="?",
        help="File to save merged output (default: <input_dir>/merger.txt)")

    # Reader management.
    add("-i", "--install", nargs=2, metavar=("EXT", "MODULE_PATH"),
        help="Install a custom reader for a given extension (e.g., .pdf)")
    add("-u", "--uninstall", metavar="EXT",
        help="Uninstall a custom reader by extension (use '*' to remove all)")
    add("--list-installed", action="store_true",
        help="List all installed custom readers")

    # Logging.
    add("-l", "--log-level",
        type=str,
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default="INFO",
        help="Set the logging level (default: INFO)")

    # Ignore rules.
    add("--ignore", nargs="*", default=[],
        help="Glob-style patterns to ignore (e.g., '*.log', '__pycache__')")
    add("-f", "--ignore-file", type=Path,
        help="File containing glob-style patterns to ignore (default: <input_dir>/merger.ignore)")

    # Reader overrides and output formatting.
    add("--overrides", type=Path,
        help="Path to Python module with `validators` and `readers` dictionaries for custom overrides")
    add("--empty", action="store_true", default=False,
        help="Include empty files in the merged output")
    add("--prefix", type=str, default="<<FILE_START: {path}>>\n",
        help="Format string for file start marker (set empty string to disable)")
    add("--suffix", type=str, default="\n<<FILE_END: {path}>>\n\n",
        help="Format string for file end marker (set empty string to disable)")

    # Directory tree header.
    add("--no-tree", action="store_true", default=False,
        help="Do not include the generated directory tree in the output file")

    args = parser.parse_args()
    setup_logger(level=getattr(logging, args.log_level.upper()))

    # --install EXT MODULE_PATH
    if args.install:
        ext, path = args.install
        register_reader(ext, path)
        logger.info(f"Installed reader for '{ext}' from '{path}'")
        return

    # --uninstall EXT (or '*' for every installed reader)
    if args.uninstall:
        if args.uninstall != "*":
            unregister_reader(args.uninstall)
            logger.info(f"Uninstalled reader for '{args.uninstall}'")
            return

        installed = list_readers()
        if not installed:
            logger.info("No custom readers to uninstall.")
            return

        for ext in list(installed.keys()):
            unregister_reader(ext)
            logger.info(f"Uninstalled reader for '{ext}'")
        return

    # --list-installed
    if args.list_installed:
        installed = list_readers()
        if installed:
            logger.info("Installed Custom Readers:")
            for ext, mod_path in installed.items():
                logger.info(f" {ext}: {mod_path}")
        else:
            logger.info("No custom readers installed.")
        return

    # Everything below is the merge path, which needs an input directory.
    if not args.input_dir:
        parser.error("input_dir is required unless installing/uninstalling/listing readers.")

    output_file = args.output_file
    if not output_file:
        output_file = args.input_dir / "merger.txt"

    # Fall back to <input_dir>/merger.ignore when no ignore file was given and that file exists.
    ignore_file = args.ignore_file
    if not ignore_file:
        candidate = args.input_dir / "merger.ignore"
        if candidate.exists():
            ignore_file = candidate
            logger.info("Found default ignore file 'merger.ignore' in input directory. Using it for ignore patterns.")

    ignore_patterns = list(args.ignore)
    if ignore_file:
        ignore_patterns.extend(read_ignore_file(ignore_file))

    # Installed readers first; entries from --overrides replace them per extension.
    readers, validators = load_installed_readers()
    if args.overrides:
        override_readers, override_validators = load_custom_readers(args.overrides)
        readers.update(override_readers)
        validators.update(override_validators)

    merge(
        dir_path=args.input_dir,
        ignore_patterns=ignore_patterns,
        output_path=output_file,
        validation_func_override=validators,
        read_func_override=readers,
        write_if_empty=args.empty,
        prefix_format=args.prefix,
        suffix_format=args.suffix,
        include_tree=not args.no_tree
    )

    logger.info(f"Saved to {output_file}")
140
+
141
+
142
+ if __name__ == "__main__":
143
+ main()
@@ -0,0 +1,139 @@
1
+ from pathlib import Path
2
+ from typing import Callable, Dict, Optional, List
3
+
4
+ import chardet
5
+
6
+ from .filtering import filter_files_by_patterns
7
+ from .tree import generate_tree_visualizer
8
+ from .logger import logger
9
+
10
+
11
def is_text_file(
        file_path: Path,
        chunk_size: int = 1024,
        *,
        validation_func_override: Optional[Dict[str, Callable[[Path], bool]]] = None,
        min_encoding_detection_confidence: float = 0.8
) -> bool:
    """
    Decide whether a file should be treated as readable text.

    If ``validation_func_override`` has a callable registered for the file's
    suffix, that callable alone decides. Otherwise the first ``chunk_size``
    bytes are read, their encoding is guessed with ``chardet`` (falling back
    to UTF-8 when no encoding is reported or the confidence is below
    ``min_encoding_detection_confidence``), and the chunk must decode cleanly.

    The chunk is decoded incrementally so that a multi-byte character cut in
    half by the chunk boundary does not make a valid text file look binary.

    Args:
        file_path (Path): File to inspect.
        chunk_size (int): Number of leading bytes to sample.
        validation_func_override: Optional mapping of suffix -> validator.
        min_encoding_detection_confidence (float): Minimum chardet confidence
            required to trust the detected encoding.

    Returns:
        bool: True if the file is considered text, False otherwise (including
        when any exception is raised during the check).
    """
    import codecs  # local import: only needed for the incremental decode below

    logger.debug(f"Checking if file is text: {file_path}")
    try:
        if validation_func_override and callable(validation_override_func := validation_func_override.get(file_path.suffix)):
            result = validation_override_func(file_path)
            logger.debug(f"Custom validator used for {file_path.suffix}: {result}")
            return result

        with open(file_path, "rb") as file:
            chunk = file.read(chunk_size)
            # Peek one byte to learn whether the sample is the whole file.
            at_eof = not file.read(1)

        result = chardet.detect(chunk)
        encoding = result.get("encoding")
        confidence = result.get("confidence", 0)
        logger.debug(f"Detected encoding for {file_path}: {encoding} (confidence: {confidence})")

        if not encoding or confidence < min_encoding_detection_confidence:
            encoding = "utf-8"
            logger.debug(f"Low confidence for {file_path}. Falling back to utf-8")

        # final=False tolerates an incomplete trailing byte sequence when the
        # sample was truncated; genuinely invalid bytes still raise.
        codecs.getincrementaldecoder(encoding)().decode(chunk, final=at_eof)
        return True

    except Exception as e:
        logger.error(f"Failed to decode {file_path}: {e}")
        return False
43
+
44
+
45
def append_content(
        root: Path,
        file_path: Path,
        output_path: Path,
        *,
        prefix: Optional[str] = "<<FILE_START: {path}>>\n",
        suffix: Optional[str] = "\n<<FILE_END: {path}>>\n\n",
        read_func_override: Optional[Dict[str, Callable[[Path], str]]] = None,
        write_if_empty: bool = False
) -> None:
    """
    Append one file's content, wrapped in its delimiters, to the output file.

    The ``{path}`` placeholder in ``prefix`` / ``suffix`` is replaced with the
    file's path relative to ``root``, prefixed with ``.`` and the platform's
    path separator (so ``.\\`` on Windows, as before, and ``./`` elsewhere
    instead of a mixed ``.\\dir/file``).

    Args:
        root (Path): Directory that ``file_path`` is made relative to.
        file_path (Path): File whose content is appended.
        output_path (Path): File opened in append mode (UTF-8).
        prefix: Format string written before the content; falsy disables it.
        suffix: Format string written after the content; falsy disables it.
        read_func_override: Optional mapping of suffix -> reader callable;
            when absent for this suffix the file is read as UTF-8 text.
        write_if_empty (bool): Write the delimiters even when content is empty.

    Read and write errors are logged and swallowed so one bad file does not
    abort the caller's loop.
    """
    import os  # local import: only needed for the platform separator

    relative_path = f".{os.sep}{file_path.relative_to(root)}"
    formatted_prefix = prefix.format(path=relative_path) if prefix else ""
    formatted_suffix = suffix.format(path=relative_path) if suffix else ""

    try:
        if read_func_override and callable(read_override_func := read_func_override.get(file_path.suffix)):
            content = read_override_func(file_path)
            logger.debug(f"Used custom reader for {file_path}")
        else:
            content = file_path.read_text(encoding="utf-8")
            logger.debug(f"Read text from {file_path} using utf-8")

        if not write_if_empty and not content:
            logger.debug(f"Skipping empty file: {file_path}")
            return

        with output_path.open("a", encoding="utf-8") as f:
            f.write(formatted_prefix + content + formatted_suffix)
            logger.debug(f"Merged: {file_path}")

    except Exception as e:
        logger.error(f"Failed to append content from {file_path}: {e}")
77
+
78
+
79
def merge(
        dir_path: Path,
        ignore_patterns: List[str],
        output_path: Path,
        *,
        read_func_override: Optional[Dict[str, Callable[[Path], str]]] = None,
        validation_func_override: Optional[Dict[str, Callable[[Path], bool]]] = None,
        min_encoding_detection_confidence: float = 0.8,
        write_if_empty: bool = False,
        prefix_format: Optional[str] = "<<FILE_START: {path}>>\n",
        suffix_format: Optional[str] = "\n<<FILE_END: {path}>>\n\n",
        include_tree: bool = True
) -> None:
    """
    Merge every accepted file under ``dir_path`` into ``output_path``.

    Steps: collect paths that survive ``ignore_patterns`` (recursively), drop
    the output file itself, start the output file with either the directory
    tree or empty content, then append each file that ``is_text_file``
    accepts.

    Args:
        dir_path (Path): Root directory to scan.
        ignore_patterns (List[str]): Glob-style patterns to exclude.
        output_path (Path): File to (over)write with the merged result.
        read_func_override: Optional mapping of suffix -> reader callable.
        validation_func_override: Optional mapping of suffix -> validator.
        min_encoding_detection_confidence (float): Forwarded to ``is_text_file``.
        write_if_empty (bool): Forwarded to ``append_content``.
        prefix_format: Delimiter format written before each file.
        suffix_format: Delimiter format written after each file.
        include_tree (bool): Write the directory tree as a header.

    If writing the tree header fails, the error is logged and the merge stops.
    """
    logger.debug(f"Starting merge from: {dir_path}")

    # Resolve the output path once instead of once per candidate path.
    resolved_output = output_path.resolve()
    paths = filter_files_by_patterns(dir_path, ignore_patterns, True)
    paths = [path for path in paths if path.resolve() != resolved_output]
    logger.debug(f"Filtered paths: {len(paths)} files to process")

    if include_tree:
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                tree = generate_tree_visualizer(dir_path, paths)
                f.write(f"{tree}\n")
                logger.debug(f"Directory tree written to {output_path}")

        except Exception as e:
            logger.error(f"Failed to write tree header to {output_path}: {e}")
            return

    else:
        # Truncate any previous output, because append_content opens in append mode.
        output_path.write_text("", encoding="utf-8")

    # Directories were only needed for the tree; merge files only.
    paths = [path for path in paths if not path.is_dir()]

    for path in paths:
        if not is_text_file(
            path,
            validation_func_override=validation_func_override,
            min_encoding_detection_confidence=min_encoding_detection_confidence
        ):
            logger.debug(f"Skipped non-text file: {path}")
            continue

        append_content(
            root=dir_path,
            file_path=path,
            output_path=output_path,
            prefix=prefix_format,
            suffix=suffix_format,
            read_func_override=read_func_override,
            write_if_empty=write_if_empty
        )
131
+
132
+
133
def read_ignore_file(file_path: Path) -> List[str]:
    """
    Read ignore patterns from a file, one pattern per line.

    Lines are stripped of surrounding whitespace and blank lines are dropped.
    The return annotation uses ``typing.List`` rather than ``list[str]``
    because the latter raises ``TypeError`` at definition time on Python 3.8,
    which the package declares as supported.

    Args:
        file_path (Path): Path to the ignore file (read as UTF-8).

    Returns:
        List[str]: The non-empty, stripped lines.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing file.
    """
    if not file_path.is_file():
        logger.error(f"Ignore file not found: {file_path}")
        raise FileNotFoundError(f"Ignore file not found: {file_path}")

    logger.debug(f"Reading ignore patterns from: {file_path}")
    return [line.strip() for line in file_path.read_text(encoding="utf-8").splitlines() if line.strip()]
@@ -0,0 +1,47 @@
1
+ from pathlib import Path
2
+ import fnmatch
3
+ from typing import List
4
+
5
+ from .logger import logger # Imports the custom logger
6
+
7
+
8
def filter_files_by_patterns(dir_path: Path, ignore_patterns: List[str], recursive: bool) -> List[Path]:
    """
    Filters files and directories under a root directory, excluding any that match ignore patterns.

    Each pattern is compared, glob-style, against the entry's path relative to
    ``dir_path`` (POSIX separators) and against the entry's bare name. A
    trailing ``/`` and a leading ``./`` on a pattern are ignored, so
    ``./example/path`` targets ``example/path`` under the root. Patterns that
    are absolute paths are additionally compared against the entry's absolute
    path. An ignored directory is not descended into.

    Args:
        dir_path (Path): Directory to scan files from.
        ignore_patterns (List[str]): List of glob-style patterns to exclude files and directories.
        recursive (bool): Whether to scan directories recursively.

    Returns:
        List[Path]: The file and directory paths that were not ignored.
    """
    # Normalize each pattern once: (pattern, is_absolute).
    normalized = []
    for pattern in ignore_patterns:
        pat = pattern.rstrip("/")
        if pat.startswith("./"):
            pat = pat[2:]
        normalized.append((pat, Path(pat).is_absolute()))

    def matches_any_pattern(path: Path) -> bool:
        rel_path = path.relative_to(dir_path).as_posix()
        abs_path = None  # computed lazily, only if an absolute pattern exists
        for pat, is_absolute in normalized:
            matched = fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(path.name, pat)
            if not matched and is_absolute:
                # Only absolute patterns are tested against the absolute path,
                # so a pattern like "*cache*" cannot match on the root's own location.
                if abs_path is None:
                    abs_path = path.absolute().as_posix()
                matched = fnmatch.fnmatch(abs_path, pat)
            if matched:
                logger.debug(f"Ignoring path '{rel_path}' matched by pattern '{pat}'")
                return True
        return False

    results = []

    def scan_dir(directory: Path):
        logger.debug(f"Scanning directory: {directory}")
        try:
            for entry in directory.iterdir():
                if matches_any_pattern(entry):
                    continue

                results.append(entry)
                if recursive and entry.is_dir():
                    scan_dir(entry)
        except Exception as e:
            # Unreadable directories are reported and skipped rather than aborting the scan.
            logger.warning(f"Failed to scan directory {directory}: {e}")

    scan_dir(dir_path)
    logger.debug(f"Total matched files and folders: {len(results)}")
    return results
@@ -0,0 +1,22 @@
1
+ import logging
2
+ import sys
3
+
4
+
5
def setup_logger(name: str = "merger", level: int = logging.INFO) -> logging.Logger:
    """
    Return the named logger configured at ``level``.

    On first use a stdout stream handler with the ``[LEVEL] message`` format
    is attached. On later calls no handler is added; instead every existing
    handler's level is set to ``level``.

    Args:
        name (str): Logger name.
        level (int): Logging level to apply.

    Returns:
        logging.Logger: The configured logger.
    """
    configured = logging.getLogger(name)
    configured.setLevel(level)

    if configured.handlers:
        # Already initialized: only re-level the handlers that are attached.
        for existing in configured.handlers:
            existing.setLevel(level)
        return configured

    stream = logging.StreamHandler(sys.stdout)
    stream.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
    configured.addHandler(stream)
    return configured
20
+
21
+
22
+ logger = setup_logger()
@@ -0,0 +1,136 @@
1
+ import importlib.util
2
+ import os
3
+ import platform
4
+ import shutil
5
+ from pathlib import Path
6
+ from typing import Tuple, Dict, Callable
7
+
8
+ from .logger import logger # Adds the logger
9
+
10
+
11
def get_readers_folder() -> Path:
    """
    Return the per-user directory that stores installed reader modules.

    The base location depends on ``platform.system()``:
    Windows uses ``LOCALAPPDATA``, then ``APPDATA``, then the home directory;
    macOS ("Darwin") uses ``~/Library/Application Support``; anything else
    uses ``~/.local/share``. ``Merger/installed_readers`` is appended and the
    directory is created if it does not exist.

    Returns:
        Path: The readers directory.
    """
    os_name = platform.system()

    if os_name == "Windows":
        app_data_root = Path(os.getenv("LOCALAPPDATA") or os.getenv("APPDATA") or str(Path.home()))
    elif os_name == "Darwin":
        app_data_root = Path.home() / "Library" / "Application Support"
    else:
        app_data_root = Path.home() / ".local" / "share"

    readers_dir = app_data_root / "Merger" / "installed_readers"
    readers_dir.mkdir(parents=True, exist_ok=True)
    logger.debug(f"Readers directory resolved to: {readers_dir}")
    return readers_dir
24
+
25
+
26
def register_reader(extension: str, module_path: str):
    """
    Validate a reader module and install it for ``extension``.

    The module is imported from ``module_path`` and must:
      * list ``"reader"`` in ``__all__`` (other names, such as
        ``"validator"``, may be listed too),
      * define a callable ``validator``,
      * define a callable ``reader``.

    On success the file is copied into the readers folder as ``<ext>.py``.

    Args:
        extension (str): File extension including the leading dot, e.g. ``.pdf``.
        module_path (str): Path to the Python file implementing the reader.

    Raises:
        ValueError: If the extension has no leading dot, the file cannot be
            loaded as a module, or the module does not meet the requirements.
        Exception: Any error raised while importing or copying is logged and
            re-raised.
    """
    if not extension.startswith("."):
        raise ValueError("Extension must start with a dot, e.g. '.pdf'")

    module_path = Path(module_path).resolve()
    logger.debug(f"Registering reader for {extension} from {module_path}")

    try:
        spec = importlib.util.spec_from_file_location("temp_custom_reader", module_path)
        # spec_from_file_location returns None when no loader fits the file
        # (e.g. wrong suffix); report that clearly instead of failing on None.
        if spec is None or spec.loader is None:
            raise ValueError(f"Cannot load a Python module from '{module_path}'")

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # "reader" must be exported; additional exports such as "validator"
        # are allowed, matching what load_installed_readers accepts.
        exported = getattr(module, "__all__", None)
        if not isinstance(exported, (list, tuple)) or "reader" not in exported:
            raise ValueError(f"Invalid module: __all__ must be a list containing 'reader', got {exported}")

        if not hasattr(module, "validator") or not callable(module.validator):
            raise ValueError("Module must define a callable 'validator'")

        if not hasattr(module, "reader") or not callable(module.reader):
            raise ValueError("Module must define a callable 'reader'")

        readers_dir = get_readers_folder()
        dest_path = readers_dir / f"{extension[1:]}.py"
        shutil.copy(module_path, dest_path)

        logger.info(f"Reader for '{extension}' registered successfully at '{dest_path}'")

    except Exception as e:
        logger.error(f"Failed to register reader for '{extension}': {e}")
        raise
59
+
60
+
61
def unregister_reader(extension: str):
    """
    Remove the installed reader module for ``extension``, if one exists.

    Args:
        extension (str): File extension including the leading dot, e.g. ``.pdf``.

    Raises:
        ValueError: If ``extension`` does not start with a dot.

    A missing reader is not an error; a warning is logged instead.
    """
    if not extension.startswith("."):
        raise ValueError("Extension must start with a dot, e.g. '.pdf'")

    target = get_readers_folder() / f"{extension[1:]}.py"

    if not target.exists():
        logger.warning(f"No reader found to unregister for extension: '{extension}'")
        return

    target.unlink()
    logger.info(f"Reader for '{extension}' unregistered (removed): {target}")
73
+
74
+
75
def list_readers() -> Dict[str, str]:
    """
    Map each installed extension to the resolved path of its reader module.

    Every ``*.py`` file in the readers folder counts as an installed reader;
    the extension is the file's stem with a leading dot.

    Returns:
        Dict[str, str]: ``{".ext": "/resolved/path/ext.py", ...}``.
    """
    installed: Dict[str, str] = {}
    for module_file in get_readers_folder().glob("*.py"):
        installed[f".{module_file.stem}"] = str(module_file.resolve())

    logger.debug(f"Listing installed readers: {installed}")
    return installed
84
+
85
+
86
def load_installed_readers() -> Tuple[Dict[str, Callable], Dict[str, Callable]]:
    """
    Import every installed reader module and collect its callables.

    A module is accepted when ``"reader"`` is listed in its ``__all__`` and
    ``reader`` is callable. If ``"validator"`` is listed in ``__all__`` it
    must be callable too. For accepted modules the ``validator`` attribute is
    stored as-is, which is ``None`` when the module defines none. Modules that
    fail to import or validate are logged and skipped.

    Returns:
        Tuple[Dict[str, Callable], Dict[str, Callable]]: ``(readers,
        validators)``, both keyed by extension (``".<file stem>"``).
    """
    readers: Dict[str, Callable] = {}
    validators: Dict[str, Callable] = {}

    for module_file in get_readers_folder().glob("*.py"):
        ext = f".{module_file.stem}"
        logger.debug(f"Loading reader module: {module_file}")

        try:
            spec = importlib.util.spec_from_file_location(f"reader_{module_file.stem}", module_file)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            exported = getattr(module, "__all__", [])
            reader_func = getattr(module, "reader", None)
            validator_func = getattr(module, "validator", None)

            if "reader" not in exported or not callable(reader_func):
                logger.warning(f"Skipping invalid reader module '{module_file.name}': missing or invalid 'reader'")
            elif "validator" in exported and not callable(validator_func):
                logger.warning(f"Skipping invalid validator in '{module_file.name}'")
            else:
                readers[ext] = reader_func
                validators[ext] = validator_func
                logger.debug(f"Loaded reader for {ext}")

        except Exception as e:
            # One broken module must not prevent the others from loading.
            logger.error(f"Failed to load reader from '{module_file}': {e}")

    return readers, validators
120
+
121
+
122
def load_custom_readers(module_path: Path) -> Tuple[Dict[str, Callable], Dict[str, Callable]]:
    """
    Load override readers and validators from a Python module file.

    The module's ``readers`` and ``validators`` attributes are returned; each
    defaults to an empty dict when the module does not define it.

    Args:
        module_path (Path): Path to the Python file to import.

    Returns:
        Tuple[Dict[str, Callable], Dict[str, Callable]]: ``(readers, validators)``.

    Raises:
        Exception: Any error raised while importing is logged and re-raised.
    """
    logger.debug(f"Loading custom readers from: {module_path}")
    try:
        module_spec = importlib.util.spec_from_file_location("custom_readers_module", module_path)
        overrides = importlib.util.module_from_spec(module_spec)
        module_spec.loader.exec_module(overrides)

        loaded = (getattr(overrides, "readers", {}), getattr(overrides, "validators", {}))
        logger.info(f"Custom readers loaded successfully from: {module_path}")
        return loaded

    except Exception as e:
        logger.error(f"Failed to load custom readers from {module_path}: {e}")
        raise
@@ -0,0 +1,59 @@
1
+ from pathlib import Path
2
+ from collections import defaultdict
3
+ from typing import Dict, Union, List
4
+
5
+ _FileTree = Dict[str, Union["_FileTree", None]]
6
+
7
+
8
def generate_tree_visualizer(root_path: Path, paths: List[Path]) -> str:
    """
    Generates a visual tree structure of the provided paths relative to a root directory.

    Each path is resolved with ``Path.resolve()`` and then expressed relative to the resolved
    root; paths that do not fall under the root are silently left out. Entries are sorted by
    name at every level, and an entry is shown with a trailing ``/`` when it is an existing
    directory on disk.

    Args:
        root_path (Path): The root directory.
        paths (List[Path]): File and directory paths to include in the tree.

    Returns:
        str: A formatted string representing the directory structure, starting with the
        root's name followed by ``/`` and one line per entry.
    """
    root_path = root_path.resolve()

    def tree() -> "_FileTree":
        # Nested mapping that creates a child node the first time a key is accessed.
        return defaultdict(tree)

    file_tree: "_FileTree" = tree()

    for path in paths:
        try:
            relative_parts = path.resolve().relative_to(root_path).parts
        except ValueError:
            # relative_to() rejects paths outside root_path; those are not drawn.
            continue

        # Walking the parts is enough to materialise every intermediate node.
        current = file_tree
        for part in relative_parts:
            current = current[part]

    def build_tree_str(d: "_FileTree", prefix: str = "", current_path: Path = root_path) -> str:
        rendered = []
        entries = sorted(d.keys())
        for i, key in enumerate(entries, 1):
            is_last = i == len(entries)
            connector = "└── " if is_last else "├── "

            full_path = current_path / key
            display_name = f"{key}/" if full_path.is_dir() else key
            rendered.append(f"{prefix}{connector}{display_name}\n")

            # The child prefix must be exactly as wide as a connector (four columns),
            # otherwise nested entries drift out of line with their parent.
            extension = "    " if is_last else "│   "
            rendered.append(build_tree_str(d[key], prefix + extension, full_path))

        return "".join(rendered)

    return f"{root_path.name}/\n{build_tree_str(file_tree)}"
59
+
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.4
2
+ Name: merger-cli
3
+ Version: 1.0.0
4
+ Summary: Merger is a tool that scans a directory, filters files using customizable patterns, and merges readable content into a single output file.
5
+ Author-email: Diogo Toporcov <diogotoporcov@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/diogotoporcov/merger-cli
8
+ Project-URL: Documentation, https://github.com/diogotoporcov/merger-cli
9
+ Keywords: merger,file system,concatenation,automation,development
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: chardet>=5.2.0
19
+ Dynamic: license-file
20
+
21
+ # Merger CLI
22
+
23
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
24
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
25
+
26
+ Merger is a command-line utility for developers that scans a directory, filters files using customizable ignore patterns, and merges all readable content into a single structured output file. It supports custom file readers and validators, making it easily extendable to formats such as `.ipynb`, `.pdf`, or any other format you need.
27
+
28
+ ---
29
+
30
+ ## Summary
31
+
32
+ 1. [Core Features](#core-features)
33
+ 2. [Dependencies](#dependencies)
34
+ 3. [Installation](#installation)
35
+ 4. [Usage](#usage)
36
+ 5. [Custom Readers](#custom-readers)
37
+ 6. [CLI Options](#cli-options)
38
+ 7. [License](#license)
39
+
40
+ ---
41
+
42
+ ## Core Features
43
+
44
+ * **Recursive merge** of all readable text files under a root directory.
45
+ * **Glob-based ignore patterns** using `.gitignore`-style syntax.
46
+ * **Automatic encoding detection**.
47
+ * **Custom file readers and validators** for non-text formats.
48
+ * **CLI support** for installation, removal, and listing of custom readers.
49
+ * **Human-readable merged output**, including a directory tree header and file delimiters.
50
+
51
+ ---
52
+
53
+ ## Dependencies
54
+
55
+ | Component | Version / Type | Notes |
56
+ |-------------|----------------|-----------------------------|
57
+ | **Python** | ≥ 3.8 | Required |
58
+
59
+ All dependencies are listed in [`requirements.txt`](requirements.txt).
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ ### 1. Clone the repository
66
+
67
+ ```bash
68
+ git clone https://github.com/diogotoporcov/merger-cli.git
69
+ cd merger-cli
70
+ ```
71
+
72
+ ### 2. Create and activate a virtual environment
73
+
74
+ **Linux / macOS**
75
+
76
+ ```bash
77
+ python -m venv .venv
78
+ source .venv/bin/activate
79
+ ```
80
+
81
+ **Windows (PowerShell)**
82
+
83
+ ```powershell
84
+ python -m venv .venv
85
+ .venv\Scripts\Activate.ps1
86
+ ```
87
+
88
+ ### 3. Install dependencies
89
+
90
+ ```bash
91
+ pip install -r requirements.txt
92
+ ```
93
+
94
+ ### 4. Install as CLI tool
95
+
96
+ ```bash
97
+ pip install .
98
+ ```
99
+
100
+ This registers the `merger` command in the active environment.
101
+
102
+ ---
103
+
104
+ ## Usage
105
+
106
+ ### Basic merge
107
+
108
+ ```bash
109
+ merger ./src ./merged.txt
110
+ ```
111
+
112
+ ### Custom ignore patterns
113
+
114
+ ```bash
115
+ merger "C:\Users\USER\Desktop\project" "C:\Users\USER\Desktop\project\output.txt" --ignore "*.log" "__pycache__" "*.tmp"
116
+ ```
117
+
118
+ ### Custom ignore file
119
+
120
+ ```bash
121
+ merger . ./output.txt --ignore-file ./merger.ignore
122
+ ```
123
+
124
+ ### Include empty files
125
+
126
+ ```bash
127
+ merger ./data ./output.txt --empty
128
+ ```
129
+
130
+ ### Verbose output
131
+
132
+ ```bash
133
+ merger ./src ./merged.txt --log-level DEBUG
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Custom Readers
139
+
140
+ You can extend Merger to handle new file formats.
141
+
142
+ ### Installing a custom reader
143
+
144
+ ```bash
145
+ merger --install .ipynb path/to/ipynb.py
146
+ ```
147
+
148
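+ Where `ipynb.py` must define the following callables and export them via `__all__` (a module that does not list `reader` in `__all__` is skipped when readers are loaded):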
149
+
150
+ * ```python
151
+ validator: Callable[[Path], bool]
152
+ ```
153
+ * ```python
154
+ reader: Callable[[Path], str]
155
+ ```
156
+
157
+ To uninstall:
158
+
159
+ ```bash
160
+ merger --uninstall .ipynb
161
+ ```
162
+
163
+ List installed readers:
164
+
165
+ ```bash
166
+ merger --list-installed
167
+ ```
168
+
169
+ An example `.ipynb` reader can be found in
170
+ [`examples/custom_readers/ipynb.py`](examples/custom_readers/ipynb.py).
171
+
172
+ ---
173
+
174
+ ## CLI Options
175
+
176
+ | Option | Description |
177
+ |-------------------------|--------------------------------------------------------------------------------|
178
+ | `--ignore` | List of glob-style ignore patterns. |
179
+ | `-f, --ignore-file`     | Path to file containing ignore patterns (Default: `<input_dir>/merger.ignore`). |
180
+ | `-i, --install` | Install a custom reader for an extension. |
181
+ | `-u, --uninstall` | Remove a custom reader (`*` removes all). |
182
+ | `--list-installed` | Show installed readers. |
183
+ | `-l, --log-level` | Set logging verbosity (`DEBUG`, `INFO`, etc.). |
184
+ | `--empty` | Include empty files in merged output. |
185
+ | `--prefix` / `--suffix` | Customize file delimiters in output. |
186
+ | `--overrides` | Load override reader definitions from a Python module. |
187
+ | `--no-tree` | Do not include the generated directory tree in the output file. |
188
+
189
+
190
+ ---
191
+
192
+ ## License
193
+
194
+ This project is licensed under the MIT License — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,22 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ setup.cfg
6
+ examples/__init__.py
7
+ examples/main.example.py
8
+ examples/custom_readers/__init__.py
9
+ examples/custom_readers/ipynb.py
10
+ merger/__init__.py
11
+ merger/cli.py
12
+ merger/files.py
13
+ merger/filtering.py
14
+ merger/logger.py
15
+ merger/registry.py
16
+ merger/tree.py
17
+ merger_cli.egg-info/PKG-INFO
18
+ merger_cli.egg-info/SOURCES.txt
19
+ merger_cli.egg-info/dependency_links.txt
20
+ merger_cli.egg-info/entry_points.txt
21
+ merger_cli.egg-info/requires.txt
22
+ merger_cli.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ merger = merger.cli:main
@@ -0,0 +1 @@
1
+ chardet>=5.2.0
@@ -0,0 +1 @@
1
+ merger
@@ -0,0 +1,33 @@
1
# Build backend used to produce the sdist and wheel.
[build-system]
requires = ["setuptools>=80.9.0", "wheel"]
build-backend = "setuptools.build_meta"

# Core distribution metadata.
[project]
name = "merger-cli"
version = "1.0.0"
description = "Merger is a tool that scans a directory, filters files using customizable patterns, and merges readable content into a single output file."
readme = "README.md"
requires-python = ">=3.8"
# NOTE(review): recent setuptools releases deprecate the table form of `license`
# in favour of an SPDX string (PEP 639); migrating presumably also means dropping
# the "License ::" classifier below. Confirm before changing.
license = { text = "MIT" }
authors = [
    { name = "Diogo Toporcov", email = "diogotoporcov@gmail.com" },
]
keywords = ["merger", "file system", "concatenation", "automation", "development"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
    "chardet>=5.2.0",
]

[project.urls]
Homepage = "https://github.com/diogotoporcov/merger-cli"
Documentation = "https://github.com/diogotoporcov/merger-cli"

# Exposes the `merger` console command, which calls `main` in `merger.cli`.
[project.scripts]
merger = "merger.cli:main"
@@ -0,0 +1,7 @@
1
+ [metadata]
2
+ description-file = README.md
3
+
4
+ [egg_info]
5
+ tag_build =
6
+ tag_date = 0
7
+