resumeminer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resumeminer-0.1.0/.gitignore +8 -0
- resumeminer-0.1.0/CHANGELOG.md +20 -0
- resumeminer-0.1.0/LICENSE +21 -0
- resumeminer-0.1.0/PKG-INFO +178 -0
- resumeminer-0.1.0/README.md +147 -0
- resumeminer-0.1.0/pyproject.toml +63 -0
- resumeminer-0.1.0/resumeminer/__init__.py +6 -0
- resumeminer-0.1.0/resumeminer/cli.py +87 -0
- resumeminer-0.1.0/resumeminer/data/__init__.py +0 -0
- resumeminer-0.1.0/resumeminer/data/skills.txt +16 -0
- resumeminer-0.1.0/resumeminer/extractors/__init__.py +5 -0
- resumeminer-0.1.0/resumeminer/extractors/pdf.py +75 -0
- resumeminer-0.1.0/resumeminer/parser.py +84 -0
- resumeminer-0.1.0/resumeminer/parsers/__init__.py +15 -0
- resumeminer-0.1.0/resumeminer/parsers/contact.py +51 -0
- resumeminer-0.1.0/resumeminer/parsers/links.py +87 -0
- resumeminer-0.1.0/resumeminer/parsers/skills.py +53 -0
- resumeminer-0.1.0/tests/conftest.py +88 -0
- resumeminer-0.1.0/tests/fixtures/README.md +37 -0
- resumeminer-0.1.0/tests/fixtures/awesome_cv_cv.pdf +0 -0
- resumeminer-0.1.0/tests/fixtures/awesome_cv_resume.pdf +0 -0
- resumeminer-0.1.0/tests/fixtures/software_engineer_resume.pdf +0 -0
- resumeminer-0.1.0/tests/test_cli.py +76 -0
- resumeminer-0.1.0/tests/test_contact.py +39 -0
- resumeminer-0.1.0/tests/test_extractors.py +51 -0
- resumeminer-0.1.0/tests/test_links.py +51 -0
- resumeminer-0.1.0/tests/test_parser.py +71 -0
- resumeminer-0.1.0/tests/test_real_resume_fixtures.py +36 -0
- resumeminer-0.1.0/tests/test_skills.py +46 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-06-18
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- PDF text extraction for resume files using `pypdf`
|
|
13
|
+
- Email and phone number extraction
|
|
14
|
+
- LinkedIn, GitHub, and portfolio URL extraction
|
|
15
|
+
- Configurable skill matching with a bundled default skills list
|
|
16
|
+
- `ResumeParser` class and `parse_resume()` convenience function
|
|
17
|
+
- CLI: `resumeminer parse <file>` with human-readable and `--json` output
|
|
18
|
+
- Python 3.9+ support
|
|
19
|
+
|
|
20
|
+
[0.1.0]: https://github.com/alixaprodev/resumeminer/releases/tag/v0.1.0
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 H. Ali
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: resumeminer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight Python library for extracting structured information from PDF resumes
|
|
5
|
+
Project-URL: Homepage, https://github.com/alixaprodev/resumeminer
|
|
6
|
+
Project-URL: Repository, https://github.com/alixaprodev/resumeminer
|
|
7
|
+
Project-URL: Issues, https://github.com/alixaprodev/resumeminer/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/alixaprodev/resumeminer#readme
|
|
9
|
+
Author-email: "H. Ali" <haxratali0@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: cv,parser,pdf,recruitment,resume
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Requires-Dist: pypdf<6,>=4.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# ResumeMiner
|
|
33
|
+
|
|
34
|
+
A lightweight Python library for extracting structured information from PDF resumes.
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- PDF text extraction via `pypdf`
|
|
39
|
+
- Email and phone number extraction
|
|
40
|
+
- LinkedIn, GitHub, and portfolio URL extraction
|
|
41
|
+
- Configurable skill matching against a bundled default list
|
|
42
|
+
- Simple API and command-line interface
|
|
43
|
+
- Structured dictionary output with raw extracted text
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install resumeminer
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Requires Python 3.9 or newer.
|
|
52
|
+
|
|
53
|
+
### Development
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
git clone https://github.com/alixaprodev/resumeminer.git
|
|
57
|
+
cd resumeminer
|
|
58
|
+
pip install -e ".[dev]"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from resumeminer import ResumeParser
|
|
65
|
+
|
|
66
|
+
parser = ResumeParser("resume.pdf")
|
|
67
|
+
result = parser.parse()
|
|
68
|
+
|
|
69
|
+
print(result["email"])
|
|
70
|
+
print(result["skills"])
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from resumeminer import parse_resume
|
|
75
|
+
|
|
76
|
+
result = parse_resume("resume.pdf")
|
|
77
|
+
print(result)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Optional custom skills list:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
parser = ResumeParser("resume.pdf", skills=["Python", "Rust", "Go"])
|
|
84
|
+
result = parser.parse()
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## CLI Usage
|
|
88
|
+
|
|
89
|
+
Parse a resume:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
resumeminer parse resume.pdf
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Human-readable output (default):
|
|
96
|
+
|
|
97
|
+
```text
|
|
98
|
+
Email: jane.developer@example.com
|
|
99
|
+
Phone: +1 (555) 123-4567
|
|
100
|
+
LinkedIn: https://linkedin.com/in/jane-developer
|
|
101
|
+
GitHub: https://github.com/janedev
|
|
102
|
+
Portfolio: https://janedeveloper.dev
|
|
103
|
+
Skills: Python, Django, React, Docker, AWS
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Full JSON output (includes `raw_text`):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
resumeminer parse resume.pdf --json
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Print version:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
resumeminer --version
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Output Example
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
{
|
|
122
|
+
"email": "jane.developer@example.com",
|
|
123
|
+
"phone": "+1 (555) 123-4567",
|
|
124
|
+
"links": {
|
|
125
|
+
"linkedin": "https://linkedin.com/in/jane-developer",
|
|
126
|
+
"github": "https://github.com/janedev",
|
|
127
|
+
"portfolio": "https://janedeveloper.dev"
|
|
128
|
+
},
|
|
129
|
+
"skills": ["Python", "Django", "React", "Docker", "AWS"],
|
|
130
|
+
"raw_text": "..."
|
|
131
|
+
}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Supported Files
|
|
135
|
+
|
|
136
|
+
- PDF resumes with extractable text
|
|
137
|
+
|
|
138
|
+
Scanned or image-only PDFs are not supported in v0.1.0.
|
|
139
|
+
|
|
140
|
+
## Limitations
|
|
141
|
+
|
|
142
|
+
- Extraction quality depends on PDF structure and formatting
|
|
143
|
+
- Regex-based parsing may miss or misread fields on unusual layouts
|
|
144
|
+
- Skill detection uses a fixed default list unless a custom list is provided
|
|
145
|
+
- Phone and portfolio URL extraction may return imperfect matches on some resumes
|
|
146
|
+
|
|
147
|
+
## Roadmap
|
|
148
|
+
|
|
149
|
+
- OCR support for scanned resumes
|
|
150
|
+
- DOCX and TXT file support
|
|
151
|
+
- Name, education, and experience extraction
|
|
152
|
+
- Section-based parsing
|
|
153
|
+
- Custom skill dictionary file path
|
|
154
|
+
- JSON schema output
|
|
155
|
+
- Batch parsing CLI
|
|
156
|
+
|
|
157
|
+
## Contributing
|
|
158
|
+
|
|
159
|
+
Contributions are welcome.
|
|
160
|
+
|
|
161
|
+
1. Fork [github.com/alixaprodev/resumeminer](https://github.com/alixaprodev/resumeminer)
|
|
162
|
+
2. Create a feature branch
|
|
163
|
+
3. Add tests for behavior changes
|
|
164
|
+
4. Run `pytest`
|
|
165
|
+
5. Open a pull request
|
|
166
|
+
|
|
167
|
+
Report issues on [GitHub Issues](https://github.com/alixaprodev/resumeminer/issues).
|
|
168
|
+
|
|
169
|
+
## License
|
|
170
|
+
|
|
171
|
+
MIT License. See [LICENSE](LICENSE).
|
|
172
|
+
|
|
173
|
+
## Author
|
|
174
|
+
|
|
175
|
+
**H. Ali**
|
|
176
|
+
|
|
177
|
+
- GitHub: [github.com/alixaprodev](https://github.com/alixaprodev)
|
|
178
|
+
- Email: [haxratali0@gmail.com](mailto:haxratali0@gmail.com)
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# ResumeMiner
|
|
2
|
+
|
|
3
|
+
A lightweight Python library for extracting structured information from PDF resumes.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- PDF text extraction via `pypdf`
|
|
8
|
+
- Email and phone number extraction
|
|
9
|
+
- LinkedIn, GitHub, and portfolio URL extraction
|
|
10
|
+
- Configurable skill matching against a bundled default list
|
|
11
|
+
- Simple API and command-line interface
|
|
12
|
+
- Structured dictionary output with raw extracted text
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install resumeminer
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Requires Python 3.9 or newer.
|
|
21
|
+
|
|
22
|
+
### Development
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/alixaprodev/resumeminer.git
|
|
26
|
+
cd resumeminer
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quick Start
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from resumeminer import ResumeParser
|
|
34
|
+
|
|
35
|
+
parser = ResumeParser("resume.pdf")
|
|
36
|
+
result = parser.parse()
|
|
37
|
+
|
|
38
|
+
print(result["email"])
|
|
39
|
+
print(result["skills"])
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from resumeminer import parse_resume
|
|
44
|
+
|
|
45
|
+
result = parse_resume("resume.pdf")
|
|
46
|
+
print(result)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Optional custom skills list:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
parser = ResumeParser("resume.pdf", skills=["Python", "Rust", "Go"])
|
|
53
|
+
result = parser.parse()
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## CLI Usage
|
|
57
|
+
|
|
58
|
+
Parse a resume:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
resumeminer parse resume.pdf
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Human-readable output (default; fields that could not be extracted are shown as `—`):
|
|
65
|
+
|
|
66
|
+
```text
|
|
67
|
+
Email: jane.developer@example.com
|
|
68
|
+
Phone: +1 (555) 123-4567
|
|
69
|
+
LinkedIn: https://linkedin.com/in/jane-developer
|
|
70
|
+
GitHub: https://github.com/janedev
|
|
71
|
+
Portfolio: https://janedeveloper.dev
|
|
72
|
+
Skills: Python, Django, React, Docker, AWS
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Full JSON output (includes `raw_text`):
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
resumeminer parse resume.pdf --json
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Print version:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
resumeminer --version
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Output Example
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
{
|
|
91
|
+
"email": "jane.developer@example.com",
|
|
92
|
+
"phone": "+1 (555) 123-4567",
|
|
93
|
+
"links": {
|
|
94
|
+
"linkedin": "https://linkedin.com/in/jane-developer",
|
|
95
|
+
"github": "https://github.com/janedev",
|
|
96
|
+
"portfolio": "https://janedeveloper.dev"
|
|
97
|
+
},
|
|
98
|
+
"skills": ["Python", "Django", "React", "Docker", "AWS"],
|
|
99
|
+
"raw_text": "..."
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Supported Files
|
|
104
|
+
|
|
105
|
+
- PDF resumes with extractable text
|
|
106
|
+
|
|
107
|
+
Scanned or image-only PDFs are not supported in v0.1.0.
|
|
108
|
+
|
|
109
|
+
## Limitations
|
|
110
|
+
|
|
111
|
+
- Extraction quality depends on PDF structure and formatting
|
|
112
|
+
- Regex-based parsing may miss or misread fields on unusual layouts
|
|
113
|
+
- Skill detection uses a fixed default list unless a custom list is provided
|
|
114
|
+
- Phone and portfolio URL extraction may return imperfect matches on some resumes
|
|
115
|
+
|
|
116
|
+
## Roadmap
|
|
117
|
+
|
|
118
|
+
- OCR support for scanned resumes
|
|
119
|
+
- DOCX and TXT file support
|
|
120
|
+
- Name, education, and experience extraction
|
|
121
|
+
- Section-based parsing
|
|
122
|
+
- Custom skill dictionary file path
|
|
123
|
+
- JSON schema output
|
|
124
|
+
- Batch parsing CLI
|
|
125
|
+
|
|
126
|
+
## Contributing
|
|
127
|
+
|
|
128
|
+
Contributions are welcome.
|
|
129
|
+
|
|
130
|
+
1. Fork [github.com/alixaprodev/resumeminer](https://github.com/alixaprodev/resumeminer)
|
|
131
|
+
2. Create a feature branch
|
|
132
|
+
3. Add tests for behavior changes
|
|
133
|
+
4. Run `pytest`
|
|
134
|
+
5. Open a pull request
|
|
135
|
+
|
|
136
|
+
Report issues on [GitHub Issues](https://github.com/alixaprodev/resumeminer/issues).
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT License. See [LICENSE](LICENSE).
|
|
141
|
+
|
|
142
|
+
## Author
|
|
143
|
+
|
|
144
|
+
**H. Ali**
|
|
145
|
+
|
|
146
|
+
- GitHub: [github.com/alixaprodev](https://github.com/alixaprodev)
|
|
147
|
+
- Email: [haxratali0@gmail.com](mailto:haxratali0@gmail.com)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "resumeminer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A lightweight Python library for extracting structured information from PDF resumes"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
license-files = ["LICENSE"]
|
|
12
|
+
requires-python = ">=3.9"
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "H. Ali", email = "haxratali0@gmail.com" },
|
|
15
|
+
]
|
|
16
|
+
keywords = ["resume", "parser", "pdf", "cv", "recruitment"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
28
|
+
"Topic :: Text Processing",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"pypdf>=4.0,<6",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=7.0",
|
|
37
|
+
"build>=1.0",
|
|
38
|
+
"twine>=5.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
resumeminer = "resumeminer.cli:main"
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/alixaprodev/resumeminer"
|
|
46
|
+
Repository = "https://github.com/alixaprodev/resumeminer"
|
|
47
|
+
Issues = "https://github.com/alixaprodev/resumeminer/issues"
|
|
48
|
+
Documentation = "https://github.com/alixaprodev/resumeminer#readme"
|
|
49
|
+
|
|
50
|
+
[tool.hatch.build.targets.wheel]
|
|
51
|
+
packages = ["resumeminer"]
|
|
52
|
+
|
|
53
|
+
[tool.hatch.build.targets.sdist]
|
|
54
|
+
include = [
|
|
55
|
+
"/resumeminer",
|
|
56
|
+
"/tests",
|
|
57
|
+
"/README.md",
|
|
58
|
+
"/LICENSE",
|
|
59
|
+
"/CHANGELOG.md",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[tool.pytest.ini_options]
|
|
63
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Command-line interface for ResumeMiner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from resumeminer import __version__
|
|
12
|
+
from resumeminer.extractors.pdf import PDFExtractionError
|
|
13
|
+
from resumeminer.parser import parse_resume
|
|
14
|
+
|
|
15
|
+
# Alias for a parse result: a dictionary with string keys and arbitrary values.
ParseResult = dict[str, Any]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the ``resumeminer`` argument parser.

    The parser exposes a global ``--version`` flag and a mandatory
    sub-command. ``parse`` is the only sub-command; it takes the path of
    the PDF resume and an optional ``--json`` switch.

    Returns:
        The fully configured top-level parser.
    """
    root = argparse.ArgumentParser(
        prog="resumeminer",
        description="A lightweight Python library for extracting structured information from PDF resumes.",
    )
    root.add_argument("--version", action="version", version=f"resumeminer {__version__}")

    # required=True makes argparse reject an invocation without a sub-command.
    commands = root.add_subparsers(dest="command", required=True)

    parse_parser = commands.add_parser("parse", help="Parse a PDF resume")
    parse_parser.add_argument("file", type=Path, help="Path to the PDF resume")
    parse_parser.add_argument(
        "--json",
        action="store_true",
        help="Print full structured output as JSON (includes raw_text)",
    )
    return root
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def format_result(result: ParseResult) -> str:
    """Format a parse result as human-readable text.

    Args:
        result: Mapping with the keys ``email``, ``phone``, ``skills`` and
            ``links`` (itself a mapping with ``linkedin``, ``github`` and
            ``portfolio``). Missing keys, ``None`` values and empty values
            are all rendered as an em dash.

    Returns:
        Six ``Label: value`` lines joined with newlines, in the order
        Email, Phone, LinkedIn, GitHub, Portfolio, Skills.
    """
    placeholder = "—"

    # A partial result (no "links" entry, links=None, no "skills" ...) must
    # not crash the formatter, so every lookup falls back to an empty value.
    links = result.get("links") or {}
    skills = result.get("skills") or []
    skills_text = ", ".join(skills) if skills else placeholder

    fields = [
        ("Email", result.get("email")),
        ("Phone", result.get("phone")),
        ("LinkedIn", links.get("linkedin")),
        ("GitHub", links.get("github")),
        ("Portfolio", links.get("portfolio")),
    ]
    lines = [f"{label}: {value or placeholder}" for label, value in fields]
    lines.append(f"Skills: {skills_text}")
    return "\n".join(lines)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the resumeminer command-line interface.

    Args:
        argv: Arguments to parse; ``None`` lets argparse fall back to
            ``sys.argv``.

    Returns:
        ``0`` when the resume was parsed and printed, ``1`` when the file
        is missing, extraction fails, or the command is not handled.
    """
    args = build_parser().parse_args(argv)

    # "parse" is the only command that is handled here.
    if args.command != "parse":
        return 1

    pdf_path = args.file
    if not pdf_path.exists():
        print(f"Error: file not found: {pdf_path}", file=sys.stderr)
        return 1

    try:
        parsed = parse_resume(pdf_path)
    except (PDFExtractionError, FileNotFoundError) as exc:
        # Both failure modes are reported the same way: message on stderr, exit code 1.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    output = json.dumps(parsed, indent=2) if args.json else format_result(parsed)
    print(output)
    return 0
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# When the module is executed directly, use main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
|
File without changes
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""PDF text extraction using pypdf."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Union
|
|
7
|
+
|
|
8
|
+
from pypdf import PdfReader
|
|
9
|
+
from pypdf.errors import PdfReadError
|
|
10
|
+
|
|
11
|
+
# A filesystem path given either as a string or as a pathlib.Path.
PathLike = Union[str, Path]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PDFExtractionError(Exception):
    """Signal that text could not be extracted from a PDF.

    Raised by ``extract_text_from_pdf`` when the file does not have a
    ``.pdf`` suffix, cannot be opened or read, or cannot be decrypted.
    """
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _normalize_text(text: str) -> str:
|
|
19
|
+
"""Collapse excessive whitespace while preserving line breaks."""
|
|
20
|
+
lines = [line.strip() for line in text.splitlines()]
|
|
21
|
+
cleaned = "\n".join(line for line in lines if line)
|
|
22
|
+
return cleaned.strip()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_text_from_pdf(file_path: PathLike) -> str:
    """Extract text from all pages of a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Extracted and cleaned text, or an empty string if no text is found.

    Raises:
        FileNotFoundError: If the file does not exist.
        PDFExtractionError: If the file does not have a ``.pdf`` suffix,
            cannot be opened or read, or is encrypted with a password
            other than the empty one.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}")

    if path.suffix.lower() != ".pdf":
        raise PDFExtractionError(f"Expected a PDF file, got: {path.suffix}")

    try:
        reader = PdfReader(str(path))
    except PdfReadError as exc:
        raise PDFExtractionError(f"Failed to read PDF: {path}") from exc
    except Exception as exc:
        raise PDFExtractionError(f"Failed to open PDF: {path}") from exc

    if reader.is_encrypted:
        try:
            # Many PDFs are "encrypted" with an empty user password; try that.
            unlocked = reader.decrypt("")
        except Exception as exc:
            raise PDFExtractionError(f"PDF is encrypted and cannot be read: {path}") from exc
        # decrypt() reports a wrong password through a falsy return value
        # instead of raising, so the result has to be checked explicitly.
        if not unlocked:
            raise PDFExtractionError(f"PDF is encrypted and cannot be read: {path}")

    try:
        # Resolving the page tree can itself fail on damaged or still-locked
        # files; surface that as the documented error type.
        pages = list(reader.pages)
    except Exception as exc:
        raise PDFExtractionError(f"Failed to read PDF: {path}") from exc

    page_texts: list[str] = []
    for page in pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            # Best effort: a page whose text cannot be extracted is skipped
            # so the remaining pages still contribute to the result.
            page_text = ""
        if page_text.strip():
            page_texts.append(page_text)

    return _normalize_text("\n".join(page_texts))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class PDFExtractor:
    """Object-style wrapper around ``extract_text_from_pdf``."""

    def extract(self, file_path: PathLike) -> str:
        """Return the cleaned text of the PDF at *file_path*.

        Delegates to ``extract_text_from_pdf``; its return value and
        exceptions are passed through unchanged.
        """
        text = extract_text_from_pdf(file_path)
        return text
|