tea-data-file-conversion 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+
2
+ MIT License
3
+
4
+ Copyright (c) 2025 Mark Moreno
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.2
2
+ Name: tea-data-file-conversion
3
+ Version: 0.1.1
4
+ Summary: Fixedwidth Processor is a Python package designed to transform fixed-width text files into CSVs using dynamic YAML schema configurations.
5
+ Author-email: Mark Moreno <mamoreno@aldineisd.org>
6
+ License: MIT
7
+ Project-URL: Bug Tracker, https://github.com/markm-io/tea-data-file-conversion/issues
8
+ Project-URL: Changelog, https://github.com/markm-io/tea-data-file-conversion/blob/main/CHANGELOG.md
9
+ Project-URL: documentation, https://tea-data-file-conversion.readthedocs.io
10
+ Project-URL: repository, https://github.com/markm-io/tea-data-file-conversion
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Natural Language :: English
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: importlib-resources>=6.5.2
25
+ Requires-Dist: pandas>=2.2.3
26
+ Requires-Dist: pyyaml>=6.0.2
27
+ Requires-Dist: rich>=10
28
+ Requires-Dist: typer<1,>=0.15
29
+
30
+ # tea-data-file-conversion
31
+
32
+ <p align="center">
33
+ <a href="https://github.com/markm-io/tea-data-file-conversion/actions/workflows/ci.yml?query=branch%3Amain">
34
+ <img src="https://img.shields.io/github/actions/workflow/status/markm-io/tea-data-file-conversion/ci.yml?branch=main&label=CI&logo=github&style=flat-square" alt="CI Status" >
35
+ </a>
36
+ <a href="https://tea-data-file-conversion.readthedocs.io">
37
+ <img src="https://img.shields.io/readthedocs/tea-data-file-conversion.svg?logo=read-the-docs&logoColor=fff&style=flat-square" alt="Documentation Status">
38
+ </a>
39
+ <a href="https://codecov.io/gh/markm-io/tea-data-file-conversion">
40
+ <img src="https://img.shields.io/codecov/c/github/markm-io/tea-data-file-conversion.svg?logo=codecov&logoColor=fff&style=flat-square" alt="Test coverage percentage">
41
+ </a>
42
+ </p>
43
+ <p align="center">
44
+ <a href="https://github.com/astral-sh/uv">
45
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv">
46
+ </a>
47
+ <a href="https://github.com/astral-sh/ruff">
48
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json" alt="Ruff">
49
+ </a>
50
+ <a href="https://github.com/pre-commit/pre-commit">
51
+ <img src="https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white&style=flat-square" alt="pre-commit">
52
+ </a>
53
+ </p>
54
+ <p align="center">
55
+ <a href="https://pypi.org/project/tea-data-file-conversion/">
56
+ <img src="https://img.shields.io/pypi/v/tea-data-file-conversion.svg?logo=python&logoColor=fff&style=flat-square" alt="PyPI Version">
57
+ </a>
58
+ <img src="https://img.shields.io/pypi/pyversions/tea-data-file-conversion.svg?style=flat-square&logo=python&amp;logoColor=fff" alt="Supported Python versions">
59
+ <img src="https://img.shields.io/pypi/l/tea-data-file-conversion.svg?style=flat-square" alt="License">
60
+ </p>
61
+
62
+ ---
63
+
64
+ **Documentation**: <a href="https://tea-data-file-conversion.readthedocs.io" target="_blank">https://tea-data-file-conversion.readthedocs.io </a>
65
+
66
+ **Source Code**: <a href="https://github.com/markm-io/tea-data-file-conversion" target="_blank">https://github.com/markm-io/tea-data-file-conversion </a>
67
+
68
+ ---
69
+
70
+ Fixedwidth Processor is a Python package designed to transform fixed-width text files into CSVs using dynamic YAML schema configurations.
71
+
72
+ ## Installation
73
+
74
+ Install this via pip (or your favourite package manager):
75
+
76
+ `pip install tea-data-file-conversion`
77
+
78
+ ## Contributors ✨
79
+
80
+ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
81
+
82
+ <!-- prettier-ignore-start -->
83
+ <!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
84
+ <!-- prettier-ignore-start -->
85
+ <!-- markdownlint-disable -->
86
+ <table>
87
+ <tbody>
88
+ <tr>
89
+ <td align="center" valign="top" width="14.28%"><a href="https://github.com/markm-io"><img src="https://avatars.githubusercontent.com/u/45011486?v=4?s=80" width="80px;" alt="Mark Moreno"/><br /><sub><b>Mark Moreno</b></sub></a><br /><a href="https://github.com/markm-io/tea-data-file-conversion/commits?author=markm-io" title="Code">💻</a> <a href="#ideas-markm-io" title="Ideas, Planning, & Feedback">🤔</a> <a href="https://github.com/markm-io/tea-data-file-conversion/commits?author=markm-io" title="Documentation">📖</a></td>
90
+ </tr>
91
+ </tbody>
92
+ </table>
93
+
94
+ <!-- markdownlint-restore -->
95
+ <!-- prettier-ignore-end -->
96
+
97
+ <!-- ALL-CONTRIBUTORS-LIST:END -->
98
+ <!-- prettier-ignore-end -->
99
+
100
+ This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
101
+
102
+ ## Credits
103
+
104
+ [![Copier](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/copier-org/copier/master/img/badge/badge-grayscale-inverted-border-orange.json)](https://github.com/copier-org/copier)
105
+
106
+ This package was created with
107
+ [Copier](https://copier.readthedocs.io/) and the
108
+ [browniebroke/pypackage-template](https://github.com/browniebroke/pypackage-template)
109
+ project template.
@@ -0,0 +1,80 @@
1
+ # tea-data-file-conversion
2
+
3
+ <p align="center">
4
+ <a href="https://github.com/markm-io/tea-data-file-conversion/actions/workflows/ci.yml?query=branch%3Amain">
5
+ <img src="https://img.shields.io/github/actions/workflow/status/markm-io/tea-data-file-conversion/ci.yml?branch=main&label=CI&logo=github&style=flat-square" alt="CI Status" >
6
+ </a>
7
+ <a href="https://tea-data-file-conversion.readthedocs.io">
8
+ <img src="https://img.shields.io/readthedocs/tea-data-file-conversion.svg?logo=read-the-docs&logoColor=fff&style=flat-square" alt="Documentation Status">
9
+ </a>
10
+ <a href="https://codecov.io/gh/markm-io/tea-data-file-conversion">
11
+ <img src="https://img.shields.io/codecov/c/github/markm-io/tea-data-file-conversion.svg?logo=codecov&logoColor=fff&style=flat-square" alt="Test coverage percentage">
12
+ </a>
13
+ </p>
14
+ <p align="center">
15
+ <a href="https://github.com/astral-sh/uv">
16
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv">
17
+ </a>
18
+ <a href="https://github.com/astral-sh/ruff">
19
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json" alt="Ruff">
20
+ </a>
21
+ <a href="https://github.com/pre-commit/pre-commit">
22
+ <img src="https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white&style=flat-square" alt="pre-commit">
23
+ </a>
24
+ </p>
25
+ <p align="center">
26
+ <a href="https://pypi.org/project/tea-data-file-conversion/">
27
+ <img src="https://img.shields.io/pypi/v/tea-data-file-conversion.svg?logo=python&logoColor=fff&style=flat-square" alt="PyPI Version">
28
+ </a>
29
+ <img src="https://img.shields.io/pypi/pyversions/tea-data-file-conversion.svg?style=flat-square&logo=python&amp;logoColor=fff" alt="Supported Python versions">
30
+ <img src="https://img.shields.io/pypi/l/tea-data-file-conversion.svg?style=flat-square" alt="License">
31
+ </p>
32
+
33
+ ---
34
+
35
+ **Documentation**: <a href="https://tea-data-file-conversion.readthedocs.io" target="_blank">https://tea-data-file-conversion.readthedocs.io </a>
36
+
37
+ **Source Code**: <a href="https://github.com/markm-io/tea-data-file-conversion" target="_blank">https://github.com/markm-io/tea-data-file-conversion </a>
38
+
39
+ ---
40
+
41
+ Fixedwidth Processor is a Python package designed to transform fixed-width text files into CSVs using dynamic YAML schema configurations.
42
+
43
+ ## Installation
44
+
45
+ Install this via pip (or your favourite package manager):
46
+
47
+ `pip install tea-data-file-conversion`
48
+
49
+ ## Contributors ✨
50
+
51
+ Thanks go to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
52
+
53
+ <!-- prettier-ignore-start -->
54
+ <!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
55
+ <!-- prettier-ignore-start -->
56
+ <!-- markdownlint-disable -->
57
+ <table>
58
+ <tbody>
59
+ <tr>
60
+ <td align="center" valign="top" width="14.28%"><a href="https://github.com/markm-io"><img src="https://avatars.githubusercontent.com/u/45011486?v=4?s=80" width="80px;" alt="Mark Moreno"/><br /><sub><b>Mark Moreno</b></sub></a><br /><a href="https://github.com/markm-io/tea-data-file-conversion/commits?author=markm-io" title="Code">💻</a> <a href="#ideas-markm-io" title="Ideas, Planning, & Feedback">🤔</a> <a href="https://github.com/markm-io/tea-data-file-conversion/commits?author=markm-io" title="Documentation">📖</a></td>
61
+ </tr>
62
+ </tbody>
63
+ </table>
64
+
65
+ <!-- markdownlint-restore -->
66
+ <!-- prettier-ignore-end -->
67
+
68
+ <!-- ALL-CONTRIBUTORS-LIST:END -->
69
+ <!-- prettier-ignore-end -->
70
+
71
+ This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
72
+
73
+ ## Credits
74
+
75
+ [![Copier](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/copier-org/copier/master/img/badge/badge-grayscale-inverted-border-orange.json)](https://github.com/copier-org/copier)
76
+
77
+ This package was created with
78
+ [Copier](https://copier.readthedocs.io/) and the
79
+ [browniebroke/pypackage-template](https://github.com/browniebroke/pypackage-template)
80
+ project template.
@@ -0,0 +1,170 @@
1
+ [build-system]
2
+ build-backend = "setuptools.build_meta"
3
+ requires = [ "setuptools" ]
4
+
5
+ [project]
6
+ name = "tea-data-file-conversion"
7
+ version = "0.1.1"
8
+ description = "Fixedwidth Processor is a Python package designed to transform fixed-width text files into CSVs using dynamic YAML schema configurations."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [
12
+ { name = "Mark Moreno", email = "mamoreno@aldineisd.org" },
13
+ ]
14
+ requires-python = ">=3.9"
15
+ classifiers = [
16
+ "Development Status :: 2 - Pre-Alpha",
17
+ "Intended Audience :: Developers",
18
+ "Natural Language :: English",
19
+ "Operating System :: OS Independent",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Software Development :: Libraries",
26
+ ]
27
+
28
+ dependencies = [
29
+ "importlib-resources>=6.5.2",
30
+ "pandas>=2.2.3",
31
+ "pyyaml>=6.0.2",
32
+ "rich>=10",
33
+ "typer>=0.15,<1",
34
+ ]
35
+ urls."Bug Tracker" = "https://github.com/markm-io/tea-data-file-conversion/issues"
36
+ urls.Changelog = "https://github.com/markm-io/tea-data-file-conversion/blob/main/CHANGELOG.md"
37
+ urls.documentation = "https://tea-data-file-conversion.readthedocs.io"
38
+ urls.repository = "https://github.com/markm-io/tea-data-file-conversion"
39
+ scripts.tea-data-file-conversion = "tea_data_file_conversion.cli:app"
40
+
41
+ [dependency-groups]
42
+ dev = [
43
+ "pytest>=8,<9",
44
+ "pytest-cov>=6,<7",
45
+ ]
46
+ docs = [
47
+ "myst-parser>=0.16; python_version>='3.11'",
48
+ "sphinx>=4; python_version>='3.11'",
49
+ "sphinx-autobuild>=2024,<2025; python_version>='3.11'",
50
+ "sphinx-wagtail-theme>=6.5.0",
51
+ ]
52
+
53
+ [tool.ruff]
54
+ target-version = "py39"
55
+ line-length = 120
56
+ lint.select = [
57
+ "B", # flake8-bugbear
58
+ "C4", # flake8-comprehensions
59
+ "S", # flake8-bandit
60
+ "F", # pyflakes
61
+ "E", # pycodestyle
62
+ "W", # pycodestyle
63
+ "UP", # pyupgrade
64
+ "I", # isort
65
+ "RUF", # ruff specific
66
+ ]
67
+ lint.ignore = [
68
+ "D203", # 1 blank line required before class docstring
69
+ "D212", # Multi-line docstring summary should start at the first line
70
+ "D100", # Missing docstring in public module
71
+ "D104", # Missing docstring in public package
72
+ "D107", # Missing docstring in `__init__`
73
+ "D401", # First line of docstring should be in imperative mood
74
+ "S324", # Use of insecure MD2, MD4, MD5, or SHA1 hash function
75
+ ]
76
+ lint.per-file-ignores."conftest.py" = [ "D100" ]
77
+ lint.per-file-ignores."docs/conf.py" = [ "D100" ]
78
+ lint.per-file-ignores."setup.py" = [ "D100" ]
79
+ lint.per-file-ignores."tests/**/*" = [
80
+ "D100",
81
+ "D101",
82
+ "D102",
83
+ "D103",
84
+ "D104",
85
+ "S101",
86
+ ]
87
+ lint.isort.known-first-party = [ "tea_data_file_conversion", "tests" ]
88
+ exclude = [
89
+ "docs/conf.py",
90
+ ]
91
+
92
+ [tool.pytest.ini_options]
93
+ addopts = """\
94
+ -v
95
+ -Wdefault
96
+ --cov=tea_data_file_conversion
97
+ --cov-report=term
98
+ --cov-report=xml
99
+ """
100
+ pythonpath = [ "src" ]
101
+
102
+ [tool.coverage.run]
103
+ branch = true
104
+
105
+ [tool.coverage.report]
106
+ exclude_lines = [
107
+ "pragma: no cover",
108
+ "@overload",
109
+ "if TYPE_CHECKING",
110
+ "raise NotImplementedError",
111
+ 'if __name__ == "__main__":',
112
+ ]
113
+
114
+ [tool.mypy]
115
+ check_untyped_defs = true
116
+ disallow_any_generics = true
117
+ disallow_incomplete_defs = true
118
+ disallow_untyped_defs = true
119
+ mypy_path = "src/"
120
+ no_implicit_optional = true
121
+ show_error_codes = true
122
+ warn_unreachable = true
123
+ warn_unused_ignores = true
124
+ exclude = [
125
+ 'docs/.*',
126
+ 'setup.py',
127
+ ]
128
+
129
+ [[tool.mypy.overrides]]
130
+ module = "tests.*"
131
+ allow_untyped_defs = true
132
+
133
+ [[tool.mypy.overrides]]
134
+ module = "docs.*"
135
+ ignore_errors = true
136
+
137
+ [tool.semantic_release]
138
+ version_toml = [ "pyproject.toml:project.version" ]
139
+ version_variables = [
140
+ "src/tea_data_file_conversion/__init__.py:__version__",
141
+ "docs/conf.py:release",
142
+ ]
143
+ build_command = """
144
+ pip install uv
145
+ uv lock
146
+ git add uv.lock
147
+ uv build
148
+ """
149
+
150
+ [tool.semantic_release.changelog]
151
+ exclude_commit_patterns = [
152
+ '''chore(?:\([^)]*?\))?: .+''',
153
+ '''ci(?:\([^)]*?\))?: .+''',
154
+ '''refactor(?:\([^)]*?\))?: .+''',
155
+ '''style(?:\([^)]*?\))?: .+''',
156
+ '''test(?:\([^)]*?\))?: .+''',
157
+ '''build\((?!deps\): .+)''',
158
+ '''Merged? .*''',
159
+ '''Initial [Cc]ommit.*''', # codespell:ignore
160
+ ]
161
+
162
+ [tool.semantic_release.changelog.environment]
163
+ keep_trailing_newline = true
164
+
165
+ [tool.semantic_release.branches.main]
166
+ match = "main"
167
+
168
+ [tool.semantic_release.branches.noop]
169
+ match = "(?!main$)"
170
+ prerelease = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env python
2
+
3
+ # This is a shim to allow GitHub to detect the package, build is done with uv
4
+ # Taken from https://github.com/Textualize/rich
5
+
6
+ import setuptools
7
+
8
+ if __name__ == "__main__":
9
+ setuptools.setup(name="tea-data-file-conversion")
@@ -0,0 +1,7 @@
1
+ __version__ = "0.1.1"
2
+
3
+ # tea_data_file_conversion/__init__.py
4
+
5
+ from .processor import export_templates, process_file, validate_yaml_config
6
+
7
+ __all__ = ["export_templates", "process_file", "validate_yaml_config"]
@@ -0,0 +1,63 @@
1
+ # file: src/tea_data_file_conversion/cli.py
2
+
3
+ r"""Command-line interface for fixed\-width file processing.
4
+
5
+ This module provides an entry point to either process a fixed\-width file
6
+ into CSV format using a dynamic YAML schema or export default YAML templates.
7
+ """
8
+
9
+ import argparse
10
+
11
+ from .processor import export_templates, process_file
12
+
13
+
14
def main():
    """Parse command-line arguments and run the requested action.

    Two modes are supported:

    - With ``--export_templates``: copy the packaged YAML templates into
      ``--schema_folder`` and stop.
    - Otherwise: convert the fixed-width ``input_file`` to CSV using the
      schemas found in ``--schema_folder``.
    """
    parser = argparse.ArgumentParser(
        description="Process a fixed-width file and output a CSV based on dynamic YAML schema."
    )
    # The input file is only needed for conversion, so it is optional at the
    # parser level and checked explicitly below. This lets --export_templates
    # be used without supplying a dummy positional argument.
    parser.add_argument(
        "input_file",
        nargs="?",
        default=None,
        help="Path to the input fixed-width file.",
    )
    parser.add_argument(
        "--output_file",
        help=(
            "Optional path for the output CSV file. "
            "If not provided, defaults to the input file name with '_output.csv' appended."
        ),
        default=None,
    )
    parser.add_argument(
        "--schema_folder",
        help="Path to the folder containing YAML schema files "
        "(or where templates will be exported). Defaults to current directory.",
        default=".",
    )
    parser.add_argument(
        "--export_templates",
        help="Export template YAML files from the built-in "
        "default_schema folder to the specified schema_folder and exit.",
        action="store_true",
    )

    args = parser.parse_args()

    if args.export_templates:
        export_templates(args.schema_folder)
        # export_templates() ends the process itself; return explicitly so the
        # conversion below can never run in export mode.
        return

    if args.input_file is None:
        parser.error("input_file is required unless --export_templates is given.")

    process_file(args.input_file, args.output_file, schema_folder=args.schema_folder)


# The console script declared in pyproject.toml targets ``cli:app``; expose the
# entry point under that name as well so the installed command can import it.
app = main
60
+
61
+
62
+ if __name__ == "__main__":
63
+ main()
@@ -0,0 +1,340 @@
1
+ # file: src/tea_data_file_conversion/processor.py
2
+
3
+ r"""Processor module for fixed\-width file conversion.
4
+
5
+ This module provides functions to:
6
+ \- Load and validate YAML schema configurations.
7
+ \- Process fixed\-width files into structured DataFrame objects.
8
+ \- Export template YAML schema files.
9
+ \- Convert CSV files into YAML schema files interactively.
10
+ """
11
+
12
+ import os
13
+ import shutil
14
+ import sys
15
+
16
+ import importlib_resources # Used to locate package data.
17
+ import pandas as pd
18
+ import yaml
19
+
20
+
21
def load_yaml_config(file_path):
    """
    Load a YAML configuration file for processing.

    Parameters
    ----------
    file_path : str
        The path to the YAML configuration file.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the file. Callers expect a
        dict, but the structure is not checked here.

    Raises
    ------
    ValueError
        If there is an error parsing the YAML file.
    OSError
        If the file cannot be opened (not caught here).
    """
    try:
        # Open in binary mode so PyYAML performs its own encoding detection
        # instead of depending on the platform's default text encoding.
        with open(file_path, "rb") as f:
            return yaml.safe_load(f)
    except yaml.YAMLError as ye:
        # Re-raise with the file name so the caller can tell which schema failed.
        raise ValueError(f"Error parsing YAML file {file_path}: {ye}") from ye
47
+
48
+
49
def validate_yaml_config(config, file_path):
    """
    Validate the structure of the YAML configuration.

    The configuration must be a dictionary containing a key 'fields' mapping to a list.
    Each field in the list must be a dictionary with integer 'start' and 'end'
    keys and a string 'output_field' key; an optional 'keep' key must be a boolean.
    'start' must be at least 1 and 'end' must not be smaller than 'start'.

    Parameters
    ----------
    config : dict
        The YAML configuration dictionary.
    file_path : str
        File path used for reporting in error messages.

    Raises
    ------
    ValueError
        If the configuration does not adhere to the expected schema.
    """
    if not isinstance(config, dict):
        raise ValueError(f"YAML file {file_path} should be a dictionary at the top level.")
    if "fields" not in config:
        raise ValueError(f"YAML file {file_path} is missing the required key 'fields'.")
    if not isinstance(config["fields"], list):
        raise ValueError(f"YAML file {file_path} key 'fields' should be a list.")

    for index, field in enumerate(config["fields"]):
        where = f"YAML file {file_path}, field at index {index}"
        if not isinstance(field, dict):
            raise ValueError(f"{where} is not a dictionary.")
        for key in ("start", "end", "output_field"):
            if key not in field:
                raise ValueError(f"{where} is missing required key '{key}'.")
        for key in ("start", "end"):
            # bool is a subclass of int, so it has to be rejected explicitly;
            # otherwise `start: true` would silently be treated as position 1.
            if isinstance(field[key], bool) or not isinstance(field[key], int):
                raise ValueError(f"{where} key '{key}' must be an integer.")
        if not isinstance(field["output_field"], str):
            raise ValueError(f"{where} key 'output_field' must be a string.")
        if "keep" in field and not isinstance(field["keep"], bool):
            raise ValueError(f"{where} key 'keep' must be a boolean.")
        # Positions are 1-based and inclusive, so anything below 1 or an end
        # before the start cannot describe a column.
        if field["start"] < 1:
            raise ValueError(f"{where} key 'start' must be 1 or greater.")
        if field["end"] < field["start"]:
            raise ValueError(f"{where} key 'end' must not be less than 'start'.")
89
+
90
+
91
def process_fixed_width_file(input_file, schema_config, skip_header=False, filter_columns=False):
    """
    Process a fixed-width file using the provided YAML schema configuration.

    Column boundaries and names are taken from ``schema_config["fields"]``, the
    file is read with ``pandas.read_fwf``, and optionally only the columns
    flagged ``keep: true`` are returned.

    Parameters
    ----------
    input_file : str
        The path to the fixed-width text file.
    schema_config : dict
        Schema configuration dictionary with field definitions.
    skip_header : bool, optional
        Accepted for interface compatibility. NOTE(review): this flag is not
        consulted by the implementation, so the first line of the file is
        always read as data - confirm whether skipping is actually wanted.
    filter_columns : bool, optional
        If True, name columns by 'mapped_field_name' where one is set and
        return only the columns marked with "keep": true.

    Returns
    -------
    pd.DataFrame
        DataFrame with the processed data.
    """
    colspecs = []  # (start, end) slices, zero-based start / exclusive end.
    unique_col_names = []  # Final column names, one per field, de-duplicated.
    kept_col_names = []  # Final names of the fields flagged keep=True.
    seen = set()  # Names already handed out, for O(1) duplicate checks.

    for field in schema_config["fields"]:
        # The schema is 1-based and inclusive; pandas wants a half-open,
        # zero-based interval, so only the start needs shifting.
        colspecs.append((field["start"] - 1, field["end"]))

        # The mapped name is only used in filtered mode and only when present.
        if filter_columns and not pd.isna(field.get("mapped_field_name")):
            base_name = field["mapped_field_name"]
        else:
            base_name = field["output_field"]

        # Disambiguate repeated names with a numeric suffix (_1, _2, ...).
        col_name = base_name
        count = 0
        while col_name in seen:
            count += 1
            col_name = f"{base_name}_{count}"
        seen.add(col_name)
        unique_col_names.append(col_name)

        # Record the de-duplicated name so a kept duplicate selects its own
        # column rather than the first column that shares the base name.
        if field.get("keep", False):
            kept_col_names.append(col_name)

    df = pd.read_fwf(input_file, colspecs=colspecs, header=None, names=unique_col_names)

    if filter_columns:
        df = df[kept_col_names]

    return df
155
+
156
+
157
def process_file(input_file, output_file=None, schema_folder=None, filter_columns=False):
    """
    Process an input fixed-width file and output a CSV file.

    The function:
    - Reads the first four characters of the file to pick the YAML schema.
    - Loads and validates that schema.
    - Processes the input file and writes the resulting DataFrame to CSV.

    Parameters
    ----------
    input_file : str
        The path to the fixed-width input file.
    output_file : str, optional
        File path for the output CSV. Defaults to the input file name (without
        its extension) with '_output.csv' appended.
    schema_folder : str, optional
        Folder where the YAML schema files are located. When None, the relative
        folder 'default_schema' is used.
    filter_columns : bool, optional
        If True, only return columns flagged with "keep": true (default is False).

    Returns
    -------
    pd.DataFrame
        The processed DataFrame.

    Raises
    ------
    ValueError
        If the first line is shorter than four characters or its first four
        characters are not numeric.

    Notes
    -----
    A schema that fails validation is reported on stdout and the process is
    terminated with ``sys.exit(1)``.
    """
    if output_file is None:
        base, _ = os.path.splitext(input_file)
        output_file = f"{base}_output.csv"

    # Only the first line is needed to decide which schema applies.
    with open(input_file) as f:
        header_line = f.readline().strip()

    if len(header_line) < 4:
        raise ValueError("The header line must contain at least 4 characters.")

    # First two characters: test month; next two: abbreviated school year.
    header = header_line[:4]
    try:
        test_month = int(header[:2])
        school_year_abbr = int(header[2:4])
    except ValueError as exc:
        # Surface which file/value was wrong instead of a bare int() message.
        raise ValueError(
            f"The first four characters of {input_file} must be numeric, got {header!r}."
        ) from exc
    full_school_year = 2000 + school_year_abbr

    if test_month < 10:
        test_name = "staar"
    else:
        test_name = "staar_eoc"
        # NOTE(review): the school-year bump is applied on this branch only
        # (month codes 10-14); confirm against the intended administration codes.
        if test_month < 15:
            full_school_year += 1

    base_folder = schema_folder if schema_folder is not None else "default_schema"
    schema_config_file = os.path.join(base_folder, test_name, f"{test_name}_{full_school_year}.yaml")
    print(f"Loading schema config: {schema_config_file}")

    schema_config = load_yaml_config(schema_config_file)
    try:
        validate_yaml_config(schema_config, schema_config_file)
    except ValueError as ve:
        print(f"YAML validation error: {ve}")
        sys.exit(1)

    df = process_fixed_width_file(input_file, schema_config, skip_header=True, filter_columns=filter_columns)

    df.to_csv(output_file, index=False)
    print(f"Data has been written to {output_file}")
    return df
228
+
229
+
230
def export_templates(schema_folder):
    """
    Export sample YAML template files to a specified folder.

    Copies every file under the ``default_schema`` directory packaged with
    this module into ``schema_folder``, preserving the relative directory
    structure and creating target directories as needed.

    Parameters
    ----------
    schema_folder : str
        The destination folder for exporting the template YAML files.

    Notes
    -----
    This function always terminates the process: ``sys.exit(1)`` when the
    packaged ``default_schema`` directory is missing, ``sys.exit(0)`` after a
    successful export.
    """
    # Look the templates up in this module's own package; fall back to the
    # distribution's package name when the module is loaded without one.
    package = __package__ or "tea_data_file_conversion"
    default_schema = importlib_resources.files(package).joinpath("default_schema")

    if not default_schema.is_dir():
        print("Default schema folder not found in package.")
        sys.exit(1)

    # as_file() yields a real filesystem path even if the package is not
    # installed as plain files, which os.walk/shutil require.
    with importlib_resources.as_file(default_schema) as default_schema_path:
        source_root = str(default_schema_path)
        for root, _dirs, files in os.walk(source_root):
            for file in files:
                source_file = os.path.join(root, file)
                rel_path = os.path.relpath(source_file, source_root)
                target_file = os.path.join(schema_folder, rel_path)
                os.makedirs(os.path.dirname(target_file), exist_ok=True)
                shutil.copy(source_file, target_file)

    print(f"Template YAML files exported to {schema_folder}.")
    print(
        "Please review and update the templates as needed, then run the script again using the --schema_folder option."
    )
    sys.exit(0)
266
+
267
+
268
def _coerce_keep_flag(value):
    """Turn a CSV 'keep' cell into a plain bool (missing/blank cells become False)."""
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "y", "1"}
    if pd.isna(value):
        return False
    return bool(value)


def csv_to_schema_yaml(csv_file, yaml_output_file=None):
    """
    Convert a CSV file into a YAML schema file for fixed-width processing.

    The CSV is loaded, its columns are listed, and the user is prompted (via
    ``input``) for the columns holding the start position, end position and
    output field name. One schema entry is produced per CSV row; rows whose
    start or end cannot be converted to ``int`` are reported and skipped.
    Optional CSV columns 'keep' and 'Mapped Field Title' populate the entry's
    'keep' and 'mapped_field_name' values.

    Errors are reported on stdout rather than raised.

    Parameters
    ----------
    csv_file : str
        Path to the input CSV file.
    yaml_output_file : str, optional
        Output file path for the YAML schema. If omitted, the CSV path with its
        extension replaced by '_schema.yaml' is used.
    """
    try:
        df = pd.read_csv(csv_file)
    except (OSError, ValueError) as e:
        # pandas' parser/empty-data/decoding errors are all ValueError subclasses.
        print(f"Error loading CSV file: {e}")
        return

    print("Available columns in the CSV:")
    for col in df.columns:
        print(f" - {col}")

    start_col = input("Enter the name of the column representing the start value: ").strip()
    end_col = input("Enter the name of the column representing the end value: ").strip()
    output_field_col = input(
        "Enter the name of the column representing the output field (e.g., 'Field Category - Field Title'): "
    ).strip()

    # A mistyped column name would otherwise surface as a KeyError traceback
    # on the first row; report it in the same style as the other errors.
    missing = [name for name in (start_col, end_col, output_field_col) if name not in df.columns]
    if missing:
        print(f"Column(s) not found in the CSV: {', '.join(missing)}")
        return

    # Map typographic dashes to '-' and drop embedded line breaks in one pass.
    cleanup = str.maketrans({"\u2010": "-", "\u2013": "-", "\n": None, "\r": None})

    fields = []
    for index, row in df.iterrows():
        try:
            start_value = int(row[start_col])
        except (ValueError, TypeError):
            print(f"Row {index}: Could not convert start value '{row[start_col]}' to int. Skipping this row.")
            continue

        try:
            end_value = int(row[end_col])
        except (ValueError, TypeError):
            print(f"Row {index}: Could not convert end value '{row[end_col]}' to int. Skipping this row.")
            continue

        output_field_value = str(row[output_field_col]).translate(cleanup)

        # Blank CSV cells arrive as NaN; fall back to the output field name so
        # the YAML only ever contains a plain string here.
        mapped_raw = row.get("Mapped Field Title", output_field_value)
        mapped_name = output_field_value if pd.isna(mapped_raw) else str(mapped_raw)

        fields.append(
            {
                "start": start_value,
                "end": end_value,
                "output_field": output_field_value,
                # Must be a real bool for the schema to pass validation later.
                "keep": _coerce_keep_flag(row.get("keep", False)),
                "mapped_field_name": mapped_name,
            }
        )

    data = {"fields": fields}

    if yaml_output_file is None:
        base, _ = os.path.splitext(csv_file)
        yaml_output_file = f"{base}_schema.yaml"

    try:
        with open(yaml_output_file, "w", encoding="utf-8") as f:
            yaml.dump(data, f, sort_keys=False)
        print(f"Schema YAML file successfully created: {yaml_output_file}")
    except (OSError, yaml.YAMLError) as e:
        print(f"Error writing YAML file: {e}")
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.2
2
+ Name: tea-data-file-conversion
3
+ Version: 0.1.1
4
+ Summary: Fixedwidth Processor is a Python package designed to transform fixed-width text files into CSVs using dynamic YAML schema configurations.
5
+ Author-email: Mark Moreno <mamoreno@aldineisd.org>
6
+ License: MIT
7
+ Project-URL: Bug Tracker, https://github.com/markm-io/tea-data-file-conversion/issues
8
+ Project-URL: Changelog, https://github.com/markm-io/tea-data-file-conversion/blob/main/CHANGELOG.md
9
+ Project-URL: documentation, https://tea-data-file-conversion.readthedocs.io
10
+ Project-URL: repository, https://github.com/markm-io/tea-data-file-conversion
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Natural Language :: English
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: importlib-resources>=6.5.2
25
+ Requires-Dist: pandas>=2.2.3
26
+ Requires-Dist: pyyaml>=6.0.2
27
+ Requires-Dist: rich>=10
28
+ Requires-Dist: typer<1,>=0.15
29
+
30
+ # tea-data-file-conversion
31
+
32
+ <p align="center">
33
+ <a href="https://github.com/markm-io/tea-data-file-conversion/actions/workflows/ci.yml?query=branch%3Amain">
34
+ <img src="https://img.shields.io/github/actions/workflow/status/markm-io/tea-data-file-conversion/ci.yml?branch=main&label=CI&logo=github&style=flat-square" alt="CI Status" >
35
+ </a>
36
+ <a href="https://tea-data-file-conversion.readthedocs.io">
37
+ <img src="https://img.shields.io/readthedocs/tea-data-file-conversion.svg?logo=read-the-docs&logoColor=fff&style=flat-square" alt="Documentation Status">
38
+ </a>
39
+ <a href="https://codecov.io/gh/markm-io/tea-data-file-conversion">
40
+ <img src="https://img.shields.io/codecov/c/github/markm-io/tea-data-file-conversion.svg?logo=codecov&logoColor=fff&style=flat-square" alt="Test coverage percentage">
41
+ </a>
42
+ </p>
43
+ <p align="center">
44
+ <a href="https://github.com/astral-sh/uv">
45
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv">
46
+ </a>
47
+ <a href="https://github.com/astral-sh/ruff">
48
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json" alt="Ruff">
49
+ </a>
50
+ <a href="https://github.com/pre-commit/pre-commit">
51
+ <img src="https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white&style=flat-square" alt="pre-commit">
52
+ </a>
53
+ </p>
54
+ <p align="center">
55
+ <a href="https://pypi.org/project/tea-data-file-conversion/">
56
+ <img src="https://img.shields.io/pypi/v/tea-data-file-conversion.svg?logo=python&logoColor=fff&style=flat-square" alt="PyPI Version">
57
+ </a>
58
+ <img src="https://img.shields.io/pypi/pyversions/tea-data-file-conversion.svg?style=flat-square&logo=python&amp;logoColor=fff" alt="Supported Python versions">
59
+ <img src="https://img.shields.io/pypi/l/tea-data-file-conversion.svg?style=flat-square" alt="License">
60
+ </p>
61
+
62
+ ---
63
+
64
+ **Documentation**: <a href="https://tea-data-file-conversion.readthedocs.io" target="_blank">https://tea-data-file-conversion.readthedocs.io</a>
65
+
66
+ **Source Code**: <a href="https://github.com/markm-io/tea-data-file-conversion" target="_blank">https://github.com/markm-io/tea-data-file-conversion</a>
67
+
68
+ ---
69
+
70
+ Fixedwidth Processor is a Python package designed to transform fixed-width text files into CSVs using dynamic YAML schema configurations.
71
+
72
+ ## Installation
73
+
74
+ Install this via pip (or your favourite package manager):
75
+
76
+ `pip install tea-data-file-conversion`
77
+
78
+ ## Contributors ✨
79
+
80
+ Thanks go to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
81
+
82
+ <!-- prettier-ignore-start -->
83
+ <!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
84
+ <!-- prettier-ignore-start -->
85
+ <!-- markdownlint-disable -->
86
+ <table>
87
+ <tbody>
88
+ <tr>
89
+ <td align="center" valign="top" width="14.28%"><a href="https://github.com/markm-io"><img src="https://avatars.githubusercontent.com/u/45011486?v=4?s=80" width="80px;" alt="Mark Moreno"/><br /><sub><b>Mark Moreno</b></sub></a><br /><a href="https://github.com/markm-io/tea-data-file-conversion/commits?author=markm-io" title="Code">💻</a> <a href="#ideas-markm-io" title="Ideas, Planning, & Feedback">🤔</a> <a href="https://github.com/markm-io/tea-data-file-conversion/commits?author=markm-io" title="Documentation">📖</a></td>
90
+ </tr>
91
+ </tbody>
92
+ </table>
93
+
94
+ <!-- markdownlint-restore -->
95
+ <!-- prettier-ignore-end -->
96
+
97
+ <!-- ALL-CONTRIBUTORS-LIST:END -->
98
+ <!-- prettier-ignore-end -->
99
+
100
+ This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind are welcome!
101
+
102
+ ## Credits
103
+
104
+ [![Copier](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/copier-org/copier/master/img/badge/badge-grayscale-inverted-border-orange.json)](https://github.com/copier-org/copier)
105
+
106
+ This package was created with
107
+ [Copier](https://copier.readthedocs.io/) and the
108
+ [browniebroke/pypackage-template](https://github.com/browniebroke/pypackage-template)
109
+ project template.
@@ -0,0 +1,16 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ src/tea_data_file_conversion/__init__.py
6
+ src/tea_data_file_conversion/cli.py
7
+ src/tea_data_file_conversion/processor.py
8
+ src/tea_data_file_conversion/py.typed
9
+ src/tea_data_file_conversion.egg-info/PKG-INFO
10
+ src/tea_data_file_conversion.egg-info/SOURCES.txt
11
+ src/tea_data_file_conversion.egg-info/dependency_links.txt
12
+ src/tea_data_file_conversion.egg-info/entry_points.txt
13
+ src/tea_data_file_conversion.egg-info/requires.txt
14
+ src/tea_data_file_conversion.egg-info/top_level.txt
15
+ tests/test_cli.py
16
+ tests/test_processor.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tea-data-file-conversion = tea_data_file_conversion.cli:app
@@ -0,0 +1,5 @@
1
+ importlib-resources>=6.5.2
2
+ pandas>=2.2.3
3
+ pyyaml>=6.0.2
4
+ rich>=10
5
+ typer<1,>=0.15
@@ -0,0 +1,86 @@
1
+ import sys
2
+
3
+ import pytest
4
+
5
+ # Import the cli module. Make sure your PYTHONPATH is set correctly.
6
+ from tea_data_file_conversion import cli
7
+
8
+
9
+ # Define dummy functions to replace export_templates and process_file.
10
def dummy_export_templates(schema_folder):
    """Recording stand-in that notes it was called and with which schema folder."""
    recorder = dummy_export_templates
    recorder.called = True
    recorder.schema_folder = schema_folder


# Initial recorder state: never called, no folder captured.
dummy_export_templates.called, dummy_export_templates.schema_folder = False, None
17
+
18
+
19
def dummy_process_file(input_file, output_file, schema_folder):
    """Recording stand-in that stores its three arguments as a tuple."""
    recorder = dummy_process_file
    recorder.called = True
    recorder.args = (input_file, output_file, schema_folder)


# Initial recorder state: never called, no arguments captured.
dummy_process_file.called, dummy_process_file.args = False, None
26
+
27
+
28
+ # A fixture to patch the functions in the cli module before each test.
29
@pytest.fixture(autouse=True)
def patch_cli_functions(monkeypatch):
    """Replace the cli module's collaborators with the recording dummies for every test."""
    stand_ins = {
        "export_templates": dummy_export_templates,
        "process_file": dummy_process_file,
    }
    for attribute, stub in stand_ins.items():
        monkeypatch.setattr(cli, attribute, stub)

    # Wipe anything a previous test recorded on the dummies.
    dummy_export_templates.called, dummy_export_templates.schema_folder = False, None
    dummy_process_file.called, dummy_process_file.args = False, None
38
+
39
+
40
+ # Test running the CLI without the --export_templates flag.
41
def test_main_without_export_templates():
    """Run ``cli.main`` without ``--export_templates``; only ``process_file`` must run.

    ``sys.argv`` is saved and restored so the fake command line cannot leak
    into tests that run later in the same session.
    """
    test_input = "dummy_input.txt"
    test_output = "dummy_output.csv"
    test_schema = "dummy_schema"
    saved_argv = sys.argv
    sys.argv = [
        "cli.py",
        test_input,
        "--output_file",
        test_output,
        "--schema_folder",
        test_schema,
    ]
    try:
        cli.main()
    finally:
        # Put the real command line back even if main() raises or exits.
        sys.argv = saved_argv
    # export_templates should NOT be called.
    assert not dummy_export_templates.called
    # process_file should be called with the provided values.
    assert dummy_process_file.called
    assert dummy_process_file.args == (test_input, test_output, test_schema)
59
+
60
+
61
+ # Test running the CLI when --export_templates flag is provided.
62
def test_main_with_export_templates():
    """Run ``cli.main`` with ``--export_templates``; both collaborators must run.

    ``sys.argv`` is saved and restored so the fake command line cannot leak
    into tests that run later in the same session.
    """
    test_input = "dummy_input.txt"
    test_schema = "dummy_schema"
    saved_argv = sys.argv
    sys.argv = [
        "cli.py",
        test_input,
        "--schema_folder",
        test_schema,
        "--export_templates",
    ]
    try:
        cli.main()
    finally:
        # Put the real command line back even if main() raises or exits.
        sys.argv = saved_argv
    # When --export_templates is provided, export_templates should be called.
    assert dummy_export_templates.called
    assert dummy_export_templates.schema_folder == test_schema
    # process_file is always called after the if-statement.
    assert dummy_process_file.called
    # Here output_file is not provided so it defaults to None.
    assert dummy_process_file.args == (test_input, None, test_schema)
80
+
81
+
82
+ # Test that missing the required input_file argument causes a SystemExit.
83
def test_main_missing_input_file():
    """``cli.main`` must exit with ``SystemExit`` when no input file is given.

    ``sys.argv`` is saved and restored so the fake command line cannot leak
    into tests that run later in the same session.
    """
    saved_argv = sys.argv
    sys.argv = ["cli.py"]
    try:
        with pytest.raises(SystemExit):
            cli.main()
    finally:
        # Put the real command line back regardless of the outcome.
        sys.argv = saved_argv
@@ -0,0 +1,124 @@
1
+ import os
2
+
3
+ import pandas as pd
4
+ import pytest
5
+
6
+ from tea_data_file_conversion.processor import (
7
+ csv_to_schema_yaml,
8
+ load_yaml_config,
9
+ process_file,
10
+ process_fixed_width_file,
11
+ validate_yaml_config,
12
+ )
13
+
14
+ # Tests for the helpers imported above from tea_data_file_conversion.processor.
15
+
16
+
17
def test_validate_yaml_config_valid():
    """A well-formed single-field config must pass validation without raising."""
    field = {"start": 1, "end": 5, "output_field": "field1", "keep": True}
    # No assertion needed: an exception here fails the test.
    validate_yaml_config({"fields": [field]}, "test.yaml")
20
+
21
+
22
def test_validate_yaml_config_invalid_cases():
    """Every malformed config must make ``validate_yaml_config`` raise ``ValueError``.

    Each case pairs a config with a short description. The description is
    included in the failure message so a regression names the offending case
    instead of producing an anonymous "DID NOT RAISE".
    """
    cases = [
        ({}, "missing fields key"),
        ({"fields": {}}, "fields not a list"),
        ({"fields": [{"invalid": "field"}]}, "missing required keys"),
        ({"fields": [{"start": "1", "end": 5, "output_field": "field1"}]}, "start not int"),
        ({"fields": [{"start": 1, "end": "5", "output_field": "field1"}]}, "end not int"),
        ({"fields": [{"start": 1, "end": 5, "output_field": 1}]}, "output_field not str"),
        ({"fields": [{"start": 1, "end": 5, "output_field": "field1", "keep": "true"}]}, "keep not bool"),
    ]

    for config, description in cases:
        try:
            validate_yaml_config(config, "test.yaml")
        except ValueError:
            # Expected outcome; move on to the next case.
            continue
        pytest.fail(f"validate_yaml_config did not raise ValueError for case: {description}")
36
+
37
+
38
def test_process_fixed_width_file(tmp_path):
    """Check the columns ``process_fixed_width_file`` returns with and without filtering."""
    # Write a small two-record fixed-width file to parse.
    source = tmp_path / "test.txt"
    source.write_text("ABC123\nDEF456")
    source_path = str(source)

    letters_field = {
        "start": 1,
        "end": 3,
        "output_field": "letters",
        "keep": True,
        "mapped_field_name": "letters_mapped",
    }
    numbers_field = {
        "start": 4,
        "end": 6,
        "output_field": "numbers",
        "keep": False,
        "mapped_field_name": "numbers_mapped",
    }
    config = {"fields": [letters_field, numbers_field]}

    # Filtering on: only the kept field is expected, under its mapped name.
    filtered = process_fixed_width_file(source_path, config, filter_columns=True)
    assert list(filtered.columns) == ["letters_mapped"]

    # Filtering off: both fields are expected, under their output_field names.
    unfiltered = process_fixed_width_file(source_path, config, filter_columns=False)
    assert list(unfiltered.columns) == ["letters", "numbers"]
70
+
71
+
72
def test_process_file_integration(tmp_path):
    """End-to-end check that ``process_file`` writes a CSV and returns a DataFrame.

    Builds a throwaway input file plus a ``schemas/staar/staar_2024.yaml``
    schema under ``tmp_path``, runs ``process_file`` against them, and asserts
    only that the output file exists and the return value is a DataFrame.
    """
    # Create test input file
    # NOTE(review): the leading "0224" and the staar/staar_2024.yaml layout
    # presumably drive schema selection inside process_file — confirm against
    # the implementation.
    input_data = "0224ABC123\nDEF456789"
    input_file = tmp_path / "test.txt"
    input_file.write_text(input_data)

    # Create test schema folder and file
    schema_folder = tmp_path / "schemas"
    schema_folder.mkdir()
    staar_folder = schema_folder / "staar"
    staar_folder.mkdir()

    # Two-field schema: one field marked keep, one not.
    schema_content = """
    fields:
      - start: 1
        end: 3
        output_field: "field1"
        keep: true
      - start: 4
        end: 6
        output_field: "field2"
        keep: false
    """
    schema_file = staar_folder / "staar_2024.yaml"
    schema_file.write_text(schema_content)

    # Test processing
    output_file = tmp_path / "output.csv"
    df = process_file(str(input_file), str(output_file), schema_folder=str(schema_folder))
    assert os.path.exists(output_file)
    assert isinstance(df, pd.DataFrame)
103
+
104
+
105
def test_csv_to_schema_yaml(tmp_path, monkeypatch):
    """Convert a two-row CSV into a schema YAML and check both fields come through."""
    # Source CSV: a header row plus two field definitions.
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("start,end,field_name\n1,5,Field A\n6,10,Field B")

    # Scripted replies for the three interactive prompts, in the order asked:
    # start column, end column, output-field column.
    answers = iter(["start", "end", "field_name"])

    def scripted_input(_prompt):
        return next(answers)

    monkeypatch.setattr("builtins.input", scripted_input)

    # Run the conversion and make sure the file was written.
    yaml_output = tmp_path / "output.yaml"
    csv_to_schema_yaml(str(csv_file), str(yaml_output))
    assert yaml_output.exists()

    # Reload the generated YAML and check it holds one entry per CSV row.
    config = load_yaml_config(str(yaml_output))
    assert "fields" in config
    assert len(config["fields"]) == 2