resumeminer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .pytest_cache/
8
+ *.egg
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-06-18
9
+
10
+ ### Added
11
+
12
+ - PDF text extraction for resume files using `pypdf`
13
+ - Email and phone number extraction
14
+ - LinkedIn, GitHub, and portfolio URL extraction
15
+ - Configurable skill matching with a bundled default skills list
16
+ - `ResumeParser` class and `parse_resume()` convenience function
17
+ - CLI: `resumeminer parse <file>` with human-readable and `--json` output
18
+ - Python 3.9+ support
19
+
20
+ [0.1.0]: https://github.com/alixaprodev/resumeminer/releases/tag/v0.1.0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 H. Ali
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: resumeminer
3
+ Version: 0.1.0
4
+ Summary: A lightweight Python library for extracting structured information from PDF resumes
5
+ Project-URL: Homepage, https://github.com/alixaprodev/resumeminer
6
+ Project-URL: Repository, https://github.com/alixaprodev/resumeminer
7
+ Project-URL: Issues, https://github.com/alixaprodev/resumeminer/issues
8
+ Project-URL: Documentation, https://github.com/alixaprodev/resumeminer#readme
9
+ Author-email: "H. Ali" <haxratali0@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: cv,parser,pdf,recruitment,resume
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Topic :: Text Processing
24
+ Requires-Python: >=3.9
25
+ Requires-Dist: pypdf<6,>=4.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: build>=1.0; extra == 'dev'
28
+ Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Requires-Dist: twine>=5.0; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # ResumeMiner
33
+
34
+ A lightweight Python library for extracting structured information from PDF resumes.
35
+
36
+ ## Features
37
+
38
+ - PDF text extraction via `pypdf`
39
+ - Email and phone number extraction
40
+ - LinkedIn, GitHub, and portfolio URL extraction
41
+ - Configurable skill matching against a bundled default list
42
+ - Simple API and command-line interface
43
+ - Structured dictionary output with raw extracted text
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install resumeminer
49
+ ```
50
+
51
+ Requires Python 3.9 or newer.
52
+
53
+ ### Development
54
+
55
+ ```bash
56
+ git clone https://github.com/alixaprodev/resumeminer.git
57
+ cd resumeminer
58
+ pip install -e ".[dev]"
59
+ ```
60
+
61
+ ## Quick Start
62
+
63
+ ```python
64
+ from resumeminer import ResumeParser
65
+
66
+ parser = ResumeParser("resume.pdf")
67
+ result = parser.parse()
68
+
69
+ print(result["email"])
70
+ print(result["skills"])
71
+ ```
72
+
73
+ ```python
74
+ from resumeminer import parse_resume
75
+
76
+ result = parse_resume("resume.pdf")
77
+ print(result)
78
+ ```
79
+
80
+ Optional custom skills list:
81
+
82
+ ```python
83
+ parser = ResumeParser("resume.pdf", skills=["Python", "Rust", "Go"])
84
+ result = parser.parse()
85
+ ```
86
+
87
+ ## CLI Usage
88
+
89
+ Parse a resume:
90
+
91
+ ```bash
92
+ resumeminer parse resume.pdf
93
+ ```
94
+
95
+ Human-readable output (default):
96
+
97
+ ```text
98
+ Email: jane.developer@example.com
99
+ Phone: +1 (555) 123-4567
100
+ LinkedIn: https://linkedin.com/in/jane-developer
101
+ GitHub: https://github.com/janedev
102
+ Portfolio: https://janedeveloper.dev
103
+ Skills: Python, Django, React, Docker, AWS
104
+ ```
105
+
106
+ Full JSON output (includes `raw_text`):
107
+
108
+ ```bash
109
+ resumeminer parse resume.pdf --json
110
+ ```
111
+
112
+ Print version:
113
+
114
+ ```bash
115
+ resumeminer --version
116
+ ```
117
+
118
+ ## Output Example
119
+
120
+ ```python
121
+ {
122
+ "email": "jane.developer@example.com",
123
+ "phone": "+1 (555) 123-4567",
124
+ "links": {
125
+ "linkedin": "https://linkedin.com/in/jane-developer",
126
+ "github": "https://github.com/janedev",
127
+ "portfolio": "https://janedeveloper.dev"
128
+ },
129
+ "skills": ["Python", "Django", "React", "Docker", "AWS"],
130
+ "raw_text": "..."
131
+ }
132
+ ```
133
+
134
+ ## Supported Files
135
+
136
+ - PDF resumes with extractable text
137
+
138
+ Scanned or image-only PDFs are not supported in v0.1.0.
139
+
140
+ ## Limitations
141
+
142
+ - Extraction quality depends on PDF structure and formatting
143
+ - Regex-based parsing may miss or misread fields on unusual layouts
144
+ - Skill detection uses a fixed default list unless a custom list is provided
145
+ - Phone and portfolio URL extraction may return imperfect matches on some resumes
146
+
147
+ ## Roadmap
148
+
149
+ - OCR support for scanned resumes
150
+ - DOCX and TXT file support
151
+ - Name, education, and experience extraction
152
+ - Section-based parsing
153
+ - Custom skill dictionary file path
154
+ - JSON schema output
155
+ - Batch parsing CLI
156
+
157
+ ## Contributing
158
+
159
+ Contributions are welcome.
160
+
161
+ 1. Fork [github.com/alixaprodev/resumeminer](https://github.com/alixaprodev/resumeminer)
162
+ 2. Create a feature branch
163
+ 3. Add tests for behavior changes
164
+ 4. Run `pytest`
165
+ 5. Open a pull request
166
+
167
+ Report issues on [GitHub Issues](https://github.com/alixaprodev/resumeminer/issues).
168
+
169
+ ## License
170
+
171
+ MIT License. See [LICENSE](LICENSE).
172
+
173
+ ## Author
174
+
175
+ **H. Ali**
176
+
177
+ - GitHub: [github.com/alixaprodev](https://github.com/alixaprodev)
178
+ - Email: [haxratali0@gmail.com](mailto:haxratali0@gmail.com)
@@ -0,0 +1,147 @@
1
+ # ResumeMiner
2
+
3
+ A lightweight Python library for extracting structured information from PDF resumes.
4
+
5
+ ## Features
6
+
7
+ - PDF text extraction via `pypdf`
8
+ - Email and phone number extraction
9
+ - LinkedIn, GitHub, and portfolio URL extraction
10
+ - Configurable skill matching against a bundled default list
11
+ - Simple API and command-line interface
12
+ - Structured dictionary output with raw extracted text
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install resumeminer
18
+ ```
19
+
20
+ Requires Python 3.9 or newer.
21
+
22
+ ### Development
23
+
24
+ ```bash
25
+ git clone https://github.com/alixaprodev/resumeminer.git
26
+ cd resumeminer
27
+ pip install -e ".[dev]"
28
+ ```
29
+
30
+ ## Quick Start
31
+
32
+ ```python
33
+ from resumeminer import ResumeParser
34
+
35
+ parser = ResumeParser("resume.pdf")
36
+ result = parser.parse()
37
+
38
+ print(result["email"])
39
+ print(result["skills"])
40
+ ```
41
+
42
+ ```python
43
+ from resumeminer import parse_resume
44
+
45
+ result = parse_resume("resume.pdf")
46
+ print(result)
47
+ ```
48
+
49
+ Optional custom skills list:
50
+
51
+ ```python
52
+ parser = ResumeParser("resume.pdf", skills=["Python", "Rust", "Go"])
53
+ result = parser.parse()
54
+ ```
55
+
56
+ ## CLI Usage
57
+
58
+ Parse a resume:
59
+
60
+ ```bash
61
+ resumeminer parse resume.pdf
62
+ ```
63
+
64
+ Human-readable output (default):
65
+
66
+ ```text
67
+ Email: jane.developer@example.com
68
+ Phone: +1 (555) 123-4567
69
+ LinkedIn: https://linkedin.com/in/jane-developer
70
+ GitHub: https://github.com/janedev
71
+ Portfolio: https://janedeveloper.dev
72
+ Skills: Python, Django, React, Docker, AWS
73
+ ```
74
+
75
+ Full JSON output (includes `raw_text`):
76
+
77
+ ```bash
78
+ resumeminer parse resume.pdf --json
79
+ ```
80
+
81
+ Print version:
82
+
83
+ ```bash
84
+ resumeminer --version
85
+ ```
86
+
87
+ ## Output Example
88
+
89
+ ```python
90
+ {
91
+ "email": "jane.developer@example.com",
92
+ "phone": "+1 (555) 123-4567",
93
+ "links": {
94
+ "linkedin": "https://linkedin.com/in/jane-developer",
95
+ "github": "https://github.com/janedev",
96
+ "portfolio": "https://janedeveloper.dev"
97
+ },
98
+ "skills": ["Python", "Django", "React", "Docker", "AWS"],
99
+ "raw_text": "..."
100
+ }
101
+ ```
102
+
103
+ ## Supported Files
104
+
105
+ - PDF resumes with extractable text
106
+
107
+ Scanned or image-only PDFs are not supported in v0.1.0.
108
+
109
+ ## Limitations
110
+
111
+ - Extraction quality depends on PDF structure and formatting
112
+ - Regex-based parsing may miss or misread fields on unusual layouts
113
+ - Skill detection uses a fixed default list unless a custom list is provided
114
+ - Phone and portfolio URL extraction may return imperfect matches on some resumes
115
+
116
+ ## Roadmap
117
+
118
+ - OCR support for scanned resumes
119
+ - DOCX and TXT file support
120
+ - Name, education, and experience extraction
121
+ - Section-based parsing
122
+ - Custom skill dictionary file path
123
+ - JSON schema output
124
+ - Batch parsing CLI
125
+
126
+ ## Contributing
127
+
128
+ Contributions are welcome.
129
+
130
+ 1. Fork [github.com/alixaprodev/resumeminer](https://github.com/alixaprodev/resumeminer)
131
+ 2. Create a feature branch
132
+ 3. Add tests for behavior changes
133
+ 4. Run `pytest`
134
+ 5. Open a pull request
135
+
136
+ Report issues on [GitHub Issues](https://github.com/alixaprodev/resumeminer/issues).
137
+
138
+ ## License
139
+
140
+ MIT License. See [LICENSE](LICENSE).
141
+
142
+ ## Author
143
+
144
+ **H. Ali**
145
+
146
+ - GitHub: [github.com/alixaprodev](https://github.com/alixaprodev)
147
+ - Email: [haxratali0@gmail.com](mailto:haxratali0@gmail.com)
@@ -0,0 +1,63 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "resumeminer"
7
+ version = "0.1.0"
8
+ description = "A lightweight Python library for extracting structured information from PDF resumes"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ license-files = ["LICENSE"]
12
+ requires-python = ">=3.9"
13
+ authors = [
14
+ { name = "H. Ali", email = "haxratali0@gmail.com" },
15
+ ]
16
+ keywords = ["resume", "parser", "pdf", "cv", "recruitment"]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Intended Audience :: Developers",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Topic :: Software Development :: Libraries :: Python Modules",
28
+ "Topic :: Text Processing",
29
+ ]
30
+ dependencies = [
31
+ "pypdf>=4.0,<6",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ dev = [
36
+ "pytest>=7.0",
37
+ "build>=1.0",
38
+ "twine>=5.0",
39
+ ]
40
+
41
+ [project.scripts]
42
+ resumeminer = "resumeminer.cli:main"
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/alixaprodev/resumeminer"
46
+ Repository = "https://github.com/alixaprodev/resumeminer"
47
+ Issues = "https://github.com/alixaprodev/resumeminer/issues"
48
+ Documentation = "https://github.com/alixaprodev/resumeminer#readme"
49
+
50
+ [tool.hatch.build.targets.wheel]
51
+ packages = ["resumeminer"]
52
+
53
+ [tool.hatch.build.targets.sdist]
54
+ include = [
55
+ "/resumeminer",
56
+ "/tests",
57
+ "/README.md",
58
+ "/LICENSE",
59
+ "/CHANGELOG.md",
60
+ ]
61
+
62
+ [tool.pytest.ini_options]
63
+ testpaths = ["tests"]
@@ -0,0 +1,6 @@
1
+ """A lightweight Python library for extracting structured information from PDF resumes."""
2
+
3
+ from resumeminer.parser import ResumeParser, parse_resume
4
+
5
+ __all__ = ["ResumeParser", "parse_resume", "__version__"]
6
+ __version__ = "0.1.0"
@@ -0,0 +1,87 @@
1
+ """Command-line interface for ResumeMiner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from resumeminer import __version__
12
+ from resumeminer.extractors.pdf import PDFExtractionError
13
+ from resumeminer.parser import parse_resume
14
+
15
+ ParseResult = dict[str, Any]
16
+
17
+
18
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the ``resumeminer`` command.

    The parser offers a top-level ``--version`` flag and one required
    sub-command, ``parse``, which takes the resume path (converted to a
    ``Path``) and an optional ``--json`` switch.

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(
        prog="resumeminer",
        description="A lightweight Python library for extracting structured information from PDF resumes.",
    )
    root.add_argument("--version", action="version", version=f"resumeminer {__version__}")

    # The chosen sub-command name is stored on the namespace as ``command``;
    # ``required=True`` makes argparse reject an invocation without one.
    commands = root.add_subparsers(dest="command", required=True)

    parse_parser = commands.add_parser("parse", help="Parse a PDF resume")
    parse_parser.add_argument("file", type=Path, help="Path to the PDF resume")
    parse_parser.add_argument(
        "--json",
        action="store_true",
        help="Print full structured output as JSON (includes raw_text)",
    )

    return root
41
+
42
+
43
def format_result(result: ParseResult) -> str:
    """Render a parse result as a human-readable, line-per-field summary.

    Args:
        result: Mapping with the keys ``email``, ``phone``, ``skills`` and
            ``links`` (itself a mapping with ``linkedin``, ``github`` and
            ``portfolio``).

    Returns:
        Six ``Label: value`` lines joined by newlines. Any falsy field, and
        an empty skills list, is shown as an em dash.
    """

    def shown(value):
        # Falsy values (None, empty string) are displayed as a placeholder.
        return value or "—"

    links = result["links"]
    skill_names = result["skills"]
    skills_text = ", ".join(skill_names) if skill_names else "—"

    rows = (
        ("Email", shown(result["email"])),
        ("Phone", shown(result["phone"])),
        ("LinkedIn", shown(links["linkedin"])),
        ("GitHub", shown(links["github"])),
        ("Portfolio", shown(links["portfolio"])),
        ("Skills", skills_text),
    )
    return "\n".join(f"{label}: {value}" for label, value in rows)
55
+
56
+
57
def main(argv: list[str] | None = None) -> int:
    """Run the resumeminer command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        ``0`` after a resume was parsed and printed, ``1`` when the file is
        missing, extraction fails, or the command is not ``parse``.
    """
    args = build_parser().parse_args(argv)

    if args.command != "parse":
        return 1

    pdf_path = args.file
    if not pdf_path.exists():
        print(f"Error: file not found: {pdf_path}", file=sys.stderr)
        return 1

    try:
        parsed = parse_resume(pdf_path)
    except (PDFExtractionError, FileNotFoundError) as exc:
        # Both failure kinds are reported the same way: message on stderr,
        # non-zero exit status.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    rendered = json.dumps(parsed, indent=2) if args.json else format_result(parsed)
    print(rendered)
    return 0


if __name__ == "__main__":
    sys.exit(main())
File without changes
@@ -0,0 +1,16 @@
1
+ Python
2
+ Django
3
+ Flask
4
+ FastAPI
5
+ JavaScript
6
+ TypeScript
7
+ React
8
+ Node.js
9
+ PostgreSQL
10
+ MySQL
11
+ MongoDB
12
+ Docker
13
+ Kubernetes
14
+ AWS
15
+ Git
16
+ Linux
@@ -0,0 +1,5 @@
1
+ """Text extraction from resume files."""
2
+
3
+ from resumeminer.extractors.pdf import PDFExtractor, extract_text_from_pdf
4
+
5
+ __all__ = ["PDFExtractor", "extract_text_from_pdf"]
@@ -0,0 +1,75 @@
1
+ """PDF text extraction using pypdf."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Union
7
+
8
+ from pypdf import PdfReader
9
+ from pypdf.errors import PdfReadError
10
+
11
+ PathLike = Union[str, Path]
12
+
13
+
14
class PDFExtractionError(Exception):
    """Signal that text could not be extracted from a PDF.

    Raised by ``extract_text_from_pdf`` when the path does not end in
    ``.pdf``, the file cannot be opened or parsed, or it is encrypted.
    """
16
+
17
+
18
+ def _normalize_text(text: str) -> str:
19
+ """Collapse excessive whitespace while preserving line breaks."""
20
+ lines = [line.strip() for line in text.splitlines()]
21
+ cleaned = "\n".join(line for line in lines if line)
22
+ return cleaned.strip()
23
+
24
+
25
def extract_text_from_pdf(file_path: PathLike) -> str:
    """Extract text from all pages of a PDF file.

    Extraction is best effort per page: a page whose text cannot be
    extracted contributes nothing instead of aborting the whole document.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Extracted and cleaned text, or an empty string if no text is found.

    Raises:
        FileNotFoundError: If the file does not exist.
        PDFExtractionError: If the path does not have a ``.pdf`` suffix, the
            PDF cannot be opened or parsed, or it is encrypted and cannot be
            unlocked with an empty password.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}")

    if path.suffix.lower() != ".pdf":
        raise PDFExtractionError(f"Expected a PDF file, got: {path.suffix}")

    try:
        reader = PdfReader(str(path))
    except PdfReadError as exc:
        raise PDFExtractionError(f"Failed to read PDF: {path}") from exc
    except Exception as exc:
        raise PDFExtractionError(f"Failed to open PDF: {path}") from exc

    if reader.is_encrypted:
        encrypted_message = f"PDF is encrypted and cannot be read: {path}"
        try:
            unlocked = reader.decrypt("")
        except Exception as exc:
            raise PDFExtractionError(encrypted_message) from exc
        # pypdf reports a wrong password through a falsy return value
        # (PasswordType.NOT_DECRYPTED) rather than an exception, so the
        # result has to be checked explicitly.
        if not unlocked:
            raise PDFExtractionError(encrypted_message)

    # Resolving the page list walks the document structure and can fail on a
    # damaged file; report that as an extraction error like every other
    # parse failure instead of leaking a pypdf exception to the caller.
    try:
        pages = list(reader.pages)
    except Exception as exc:
        raise PDFExtractionError(f"Failed to read PDF: {path}") from exc

    page_texts: list[str] = []
    for page in pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            # Deliberate best effort: skip a page that cannot be decoded.
            page_text = ""
        if page_text.strip():
            page_texts.append(page_text)

    return _normalize_text("\n".join(page_texts))
68
+
69
+
70
class PDFExtractor:
    """Object-style wrapper around ``extract_text_from_pdf``."""

    def extract(self, file_path: PathLike) -> str:
        """Return the text extracted from the PDF at ``file_path``.

        Delegates to ``extract_text_from_pdf``; its return value and any
        exception it raises are passed through unchanged.
        """
        text = extract_text_from_pdf(file_path)
        return text