cv-parser 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +76 -9
- data/lib/cv_parser/configuration.rb +1 -0
- data/lib/cv_parser/errors.rb +3 -0
- data/lib/cv_parser/extractor.rb +1 -0
- data/lib/cv_parser/pdf_converter.rb +1 -0
- data/lib/cv_parser/providers/anthropic.rb +49 -7
- data/lib/cv_parser/providers/base.rb +19 -3
- data/lib/cv_parser/providers/faker.rb +18 -3
- data/lib/cv_parser/providers/openai.rb +54 -15
- data/lib/cv_parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f19aafe2fe36f105d7c2d307d2717697e193326b9505e14d7f5bfe7be227ff9b
|
4
|
+
data.tar.gz: 1d82e243b702db581b1fdf68e2ff55ab03d1417d3058b4649e012f9210c05874
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 97e6de02e6543085f46d03f77b5a64ee712c4da2bb30b3ac3d923d0aa9d820f9824a3a838226904901e6d5abfbadf51b28b576b08f437d1b346aac37321d8444
|
7
|
+
data.tar.gz: ea5efd136b8cbbde4eb96694d164ae60295807017129434886fdc57708ec5ebd72a8a4a807d72c4732289467207b315564c97f51bc9b2939ce407ef7fac176b4
|
data/README.md
CHANGED
@@ -3,12 +3,14 @@
|
|
3
3
|
A Ruby gem for parsing and extracting structured information from CVs/resumes using LLM providers.
|
4
4
|
|
5
5
|
## Features
|
6
|
-
-
|
7
|
-
-
|
8
|
-
-
|
9
|
-
-
|
10
|
-
-
|
11
|
-
-
|
6
|
+
- **Multiple file format support**: PDF, DOCX, TXT, and Markdown files
|
7
|
+
- **Smart file processing**: Converts DOCX to PDF, processes text files directly (no upload required)
|
8
|
+
- **Extract structured data** from CVs using leading LLM providers
|
9
|
+
- **Multiple LLM providers**: OpenAI, Anthropic, and Faker (for testing)
|
10
|
+
- **Customizable output schema** using JSON Schema format
|
11
|
+
- **Command-line interface** for quick parsing and analysis
|
12
|
+
- **Performance optimized**: Text files bypass upload for faster processing
|
13
|
+
- **Robust error handling** and validation
|
12
14
|
|
13
15
|
## Installation
|
14
16
|
|
@@ -191,10 +193,22 @@ extractor.extract(
|
|
191
193
|
|
192
194
|
```ruby
|
193
195
|
extractor = CvParser::Extractor.new
|
196
|
+
|
197
|
+
# Extract from PDF (uploaded to LLM)
|
194
198
|
result = extractor.extract(
|
195
199
|
file_path: "path/to/resume.pdf"
|
196
200
|
)
|
197
201
|
|
202
|
+
# Extract from text file (fast, no upload)
|
203
|
+
result = extractor.extract(
|
204
|
+
file_path: "path/to/resume.txt"
|
205
|
+
)
|
206
|
+
|
207
|
+
# Extract from markdown file (fast, no upload)
|
208
|
+
result = extractor.extract(
|
209
|
+
file_path: "path/to/resume.md"
|
210
|
+
)
|
211
|
+
|
198
212
|
puts "Name: #{result['personal_info']['name']}"
|
199
213
|
puts "Email: #{result['personal_info']['email']}"
|
200
214
|
result['skills'].each { |skill| puts "- #{skill}" }
|
@@ -205,10 +219,15 @@ result['skills'].each { |skill| puts "- #{skill}" }
|
|
205
219
|
```ruby
|
206
220
|
begin
|
207
221
|
result = extractor.extract(
|
208
|
-
file_path: "path/to/resume.
|
222
|
+
file_path: "path/to/resume.txt" # Works with any supported format
|
209
223
|
)
|
210
224
|
rescue CvParser::FileNotFoundError, CvParser::FileNotReadableError => e
|
211
225
|
puts "File error: #{e.message}"
|
226
|
+
rescue CvParser::EmptyTextFileError => e
|
227
|
+
puts "Text file is empty: #{e.message}"
|
228
|
+
|
229
|
+
rescue CvParser::TextFileEncodingError => e
|
230
|
+
puts "Text file encoding error: #{e.message}"
|
212
231
|
rescue CvParser::ParseError => e
|
213
232
|
puts "Error parsing the response: #{e.message}"
|
214
233
|
rescue CvParser::APIError => e
|
@@ -225,10 +244,19 @@ end
|
|
225
244
|
CV Parser also provides a CLI for quick analysis:
|
226
245
|
|
227
246
|
```bash
|
247
|
+
# Process different file formats
|
228
248
|
cv-parser path/to/resume.pdf
|
249
|
+
cv-parser path/to/resume.docx
|
250
|
+
cv-parser path/to/resume.txt
|
251
|
+
cv-parser path/to/resume.md
|
252
|
+
|
253
|
+
# Use different providers
|
229
254
|
cv-parser --provider anthropic path/to/resume.pdf
|
230
|
-
cv-parser --
|
231
|
-
|
255
|
+
cv-parser --provider openai path/to/resume.txt
|
256
|
+
|
257
|
+
# Output options
|
258
|
+
cv-parser --format yaml --output result.yaml path/to/resume.md
|
259
|
+
cv-parser --schema custom-schema.json path/to/resume.txt
|
232
260
|
cv-parser --help
|
233
261
|
```
|
234
262
|
|
@@ -242,6 +270,45 @@ export CV_PARSER_API_KEY=your-api-key
|
|
242
270
|
cv-parser resume.pdf
|
243
271
|
```
|
244
272
|
|
273
|
+
## Supported File Formats
|
274
|
+
|
275
|
+
CV Parser supports multiple file formats with optimized processing:
|
276
|
+
|
277
|
+
### File Format Support
|
278
|
+
|
279
|
+
| Format | Extension | Processing Method | Upload Required | Performance |
|
280
|
+
|--------|-----------|-------------------|-----------------|-------------|
|
281
|
+
| PDF | `.pdf` | Direct upload | Yes | Standard |
|
282
|
+
| DOCX | `.docx` | Convert to PDF → Upload | Yes | Standard |
|
283
|
+
| Text | `.txt` | Direct text processing | **No** | **Fast** |
|
284
|
+
| Markdown | `.md` | Direct text processing | **No** | **Fast** |
|
285
|
+
|
286
|
+
### Performance Benefits of Text Files
|
287
|
+
|
288
|
+
Text files (`.txt` and `.md`) offer significant performance advantages:
|
289
|
+
|
290
|
+
- **No file upload overhead**: Content is included directly in the API request
|
291
|
+
- **Faster processing**: Eliminates the upload → reference workflow
|
292
|
+
- **Reduced API calls**: Single request instead of upload + process
|
293
|
+
- **Lower bandwidth usage**: Direct text inclusion vs binary file transfer
|
294
|
+
- **Better for automation**: Simpler integration in automated workflows
|
295
|
+
|
296
|
+
### File Size Limits
|
297
|
+
|
298
|
+
- **PDF/DOCX files**: Limited by LLM provider (typically 20MB)
|
299
|
+
- **Text files**: No explicit size limits (limited only by LLM provider)
|
300
|
+
|
301
|
+
### File Processing Examples
|
302
|
+
|
303
|
+
```ruby
|
304
|
+
# Fast text processing (no upload)
|
305
|
+
extractor.extract(file_path: "resume.txt", output_schema: schema)
|
306
|
+
extractor.extract(file_path: "resume.md", output_schema: schema)
|
307
|
+
|
308
|
+
# Standard file processing (with upload)
|
309
|
+
extractor.extract(file_path: "resume.pdf", output_schema: schema)
|
310
|
+
extractor.extract(file_path: "resume.docx", output_schema: schema)
|
311
|
+
```
|
245
312
|
|
246
313
|
## Advanced Configuration
|
247
314
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module CvParser
|
4
|
+
# Configuration settings for CV parser including LLM provider, API credentials, and extraction options
|
4
5
|
class Configuration
|
5
6
|
attr_accessor :provider, :model, :api_key, :timeout, :max_retries, :prompt, :system_prompt,
|
6
7
|
:output_schema, :max_tokens, :temperature
|
data/lib/cv_parser/errors.rb
CHANGED
@@ -11,4 +11,7 @@ module CvParser
|
|
11
11
|
class InvalidRequestError < APIError; end
|
12
12
|
class FileNotFoundError < Error; end
|
13
13
|
class FileNotReadableError < Error; end
|
14
|
+
class TextFileError < Error; end
|
15
|
+
class TextFileEncodingError < TextFileError; end
|
16
|
+
class EmptyTextFileError < TextFileError; end
|
14
17
|
end
|
data/lib/cv_parser/extractor.rb
CHANGED
@@ -30,13 +30,7 @@ module CvParser
|
|
30
30
|
def extract_data(output_schema:, file_path: nil)
|
31
31
|
validate_inputs!(output_schema, file_path)
|
32
32
|
|
33
|
-
|
34
|
-
base64_content = encode_file_to_base64(processed_file_path)
|
35
|
-
|
36
|
-
response = make_api_request(output_schema, base64_content)
|
37
|
-
|
38
|
-
cleanup_temp_file(processed_file_path, file_path)
|
39
|
-
|
33
|
+
response = process_file_and_get_response(file_path, output_schema)
|
40
34
|
handle_tool_response(response, output_schema)
|
41
35
|
rescue Faraday::Error => e
|
42
36
|
raise APIError, "Anthropic API connection error: #{e.message}"
|
@@ -44,6 +38,21 @@ module CvParser
|
|
44
38
|
|
45
39
|
private
|
46
40
|
|
41
|
+
def process_file_and_get_response(file_path, output_schema)
|
42
|
+
if text_file?(file_path)
|
43
|
+
# Handle text files without base64 encoding
|
44
|
+
text_content = read_text_file_content(file_path)
|
45
|
+
make_api_request_with_text(output_schema, text_content)
|
46
|
+
else
|
47
|
+
# Existing file processing logic
|
48
|
+
processed_file_path = prepare_file(file_path)
|
49
|
+
base64_content = encode_file_to_base64(processed_file_path)
|
50
|
+
response = make_api_request(output_schema, base64_content)
|
51
|
+
cleanup_temp_file(processed_file_path, file_path)
|
52
|
+
response
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
47
56
|
def validate_inputs!(output_schema, file_path)
|
48
57
|
raise ArgumentError, "File_path must be provided" unless file_path
|
49
58
|
|
@@ -122,6 +131,39 @@ module CvParser
|
|
122
131
|
}
|
123
132
|
end
|
124
133
|
|
134
|
+
def make_api_request_with_text(output_schema, text_content)
|
135
|
+
extraction_tool = build_extraction_tool(output_schema)
|
136
|
+
|
137
|
+
@client.post do |req|
|
138
|
+
req.headers["Content-Type"] = "application/json"
|
139
|
+
req.body = build_text_request_body(output_schema, extraction_tool, text_content).to_json
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def build_text_request_body(output_schema, extraction_tool, text_content)
|
144
|
+
{
|
145
|
+
model: @config.model || DEFAULT_MODEL,
|
146
|
+
max_tokens: @config.max_tokens,
|
147
|
+
temperature: @config.temperature,
|
148
|
+
system: build_system_prompt,
|
149
|
+
tools: [extraction_tool],
|
150
|
+
tool_choice: { type: "tool", name: TOOL_NAME },
|
151
|
+
messages: [build_text_message(output_schema, text_content)]
|
152
|
+
}
|
153
|
+
end
|
154
|
+
|
155
|
+
def build_text_message(output_schema, text_content)
|
156
|
+
{
|
157
|
+
role: "user",
|
158
|
+
content: [
|
159
|
+
{
|
160
|
+
type: "text",
|
161
|
+
text: "#{build_extraction_prompt(output_schema)}\n\nCV Content:\n#{text_content}"
|
162
|
+
}
|
163
|
+
]
|
164
|
+
}
|
165
|
+
end
|
166
|
+
|
125
167
|
def build_extraction_tool(output_schema)
|
126
168
|
json_schema = normalize_schema_to_json_schema(output_schema)
|
127
169
|
|
@@ -47,15 +47,16 @@ module CvParser
|
|
47
47
|
# Convert DOCX to PDF
|
48
48
|
@pdf_converter.convert(file_path, temp_pdf_path)
|
49
49
|
temp_pdf_path
|
50
|
-
when ".pdf"
|
51
|
-
#
|
50
|
+
when ".pdf", ".txt", ".md"
|
51
|
+
# PDF files, text files - return as-is
|
52
|
+
# Text files will be handled as text content by providers
|
52
53
|
file_path
|
53
54
|
else
|
54
55
|
# For other file types, let the provider handle them directly
|
55
56
|
file_path
|
56
57
|
end
|
57
58
|
rescue StandardError => e
|
58
|
-
raise APIError, "Failed to convert
|
59
|
+
raise APIError, "Failed to convert file: #{e.message}"
|
59
60
|
end
|
60
61
|
|
61
62
|
def cleanup_temp_file(processed_file_path, original_file_path)
|
@@ -114,6 +115,21 @@ module CvParser
|
|
114
115
|
|
115
116
|
raise FileNotReadableError, "File not readable: #{file_path}"
|
116
117
|
end
|
118
|
+
|
119
|
+
def text_file?(file_path)
|
120
|
+
[".txt", ".md"].include?(File.extname(file_path).downcase)
|
121
|
+
end
|
122
|
+
|
123
|
+
def read_text_file_content(file_path)
|
124
|
+
content = File.read(file_path, encoding: "UTF-8")
|
125
|
+
|
126
|
+
# Validate content is not empty
|
127
|
+
raise EmptyTextFileError, "Text file is empty: #{file_path}" if content.strip.empty?
|
128
|
+
|
129
|
+
content
|
130
|
+
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
|
131
|
+
raise TextFileEncodingError, "Invalid text encoding in file #{file_path}: #{e.message}"
|
132
|
+
end
|
117
133
|
end
|
118
134
|
end
|
119
135
|
end
|
@@ -27,17 +27,32 @@ module CvParser
|
|
27
27
|
JSON_SCHEMA_TYPE = "json_schema"
|
28
28
|
|
29
29
|
def extract_data(output_schema:, file_path: nil)
|
30
|
-
|
30
|
+
validate_inputs!(output_schema, file_path)
|
31
31
|
generate_fake_data(output_schema)
|
32
32
|
end
|
33
33
|
|
34
|
-
def upload_file(
|
34
|
+
def upload_file(_file_path)
|
35
35
|
# No-op for faker provider
|
36
36
|
{ id: "fake-file-#{SecureRandom.hex(8)}" }
|
37
37
|
end
|
38
38
|
|
39
39
|
private
|
40
40
|
|
41
|
+
def validate_inputs!(output_schema, file_path)
|
42
|
+
validate_schema_format!(output_schema)
|
43
|
+
|
44
|
+
# Validate file if provided
|
45
|
+
return unless file_path
|
46
|
+
|
47
|
+
validate_file_exists!(file_path)
|
48
|
+
validate_file_readable!(file_path)
|
49
|
+
|
50
|
+
# For text files, validate content
|
51
|
+
return unless text_file?(file_path)
|
52
|
+
|
53
|
+
read_text_file_content(file_path) # Just for validation
|
54
|
+
end
|
55
|
+
|
41
56
|
def validate_schema_format!(output_schema)
|
42
57
|
return if valid_json_schema_format?(output_schema)
|
43
58
|
|
@@ -129,7 +144,7 @@ module CvParser
|
|
129
144
|
end
|
130
145
|
end
|
131
146
|
|
132
|
-
def generate_string_value(key,
|
147
|
+
def generate_string_value(key, _description = nil)
|
133
148
|
key_string = key.to_s.downcase
|
134
149
|
|
135
150
|
case key_string
|
@@ -42,12 +42,7 @@ module CvParser
|
|
42
42
|
def extract_data(output_schema:, file_path: nil)
|
43
43
|
validate_inputs!(output_schema, file_path)
|
44
44
|
|
45
|
-
|
46
|
-
file_id = upload_file(processed_file_path)
|
47
|
-
response = create_response_with_file(file_id, output_schema)
|
48
|
-
|
49
|
-
cleanup_temp_file(processed_file_path, file_path)
|
50
|
-
|
45
|
+
response = process_file_and_get_response(file_path, output_schema)
|
51
46
|
parse_response_output(response)
|
52
47
|
rescue Timeout::Error => e
|
53
48
|
raise APIError, "OpenAI API timeout: #{e.message}"
|
@@ -74,6 +69,21 @@ module CvParser
|
|
74
69
|
|
75
70
|
private
|
76
71
|
|
72
|
+
def process_file_and_get_response(file_path, output_schema)
|
73
|
+
if text_file?(file_path)
|
74
|
+
# Handle text files without upload
|
75
|
+
text_content = read_text_file_content(file_path)
|
76
|
+
create_response_with_text(text_content, output_schema)
|
77
|
+
else
|
78
|
+
# Existing file upload logic
|
79
|
+
processed_file_path = prepare_file(file_path)
|
80
|
+
file_id = upload_file(processed_file_path)
|
81
|
+
response = create_response_with_file(file_id, output_schema)
|
82
|
+
cleanup_temp_file(processed_file_path, file_path)
|
83
|
+
response
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
77
87
|
def validate_inputs!(output_schema, file_path)
|
78
88
|
raise ArgumentError, "File_path must be provided" unless file_path
|
79
89
|
|
@@ -277,10 +287,44 @@ module CvParser
|
|
277
287
|
]
|
278
288
|
end
|
279
289
|
|
290
|
+
def create_response_with_text(text_content, schema)
|
291
|
+
uri = URI(API_RESPONSES_URL)
|
292
|
+
payload = build_text_response_payload(text_content, schema)
|
293
|
+
make_responses_api_request(uri, payload)
|
294
|
+
end
|
295
|
+
|
296
|
+
def build_text_response_payload(text_content, schema)
|
297
|
+
{
|
298
|
+
model: @config.model || DEFAULT_MODEL,
|
299
|
+
input: build_text_input_for_responses_api(text_content),
|
300
|
+
text: {
|
301
|
+
format: {
|
302
|
+
type: "json_schema",
|
303
|
+
name: SCHEMA_NAME,
|
304
|
+
schema: schema_to_json_schema(schema)
|
305
|
+
}
|
306
|
+
}
|
307
|
+
}
|
308
|
+
end
|
309
|
+
|
310
|
+
def build_text_input_for_responses_api(text_content)
|
311
|
+
[
|
312
|
+
{
|
313
|
+
role: "user",
|
314
|
+
content: [
|
315
|
+
{
|
316
|
+
type: "input_text",
|
317
|
+
text: "#{build_extraction_prompt}\n\nCV Content:\n#{text_content}"
|
318
|
+
}
|
319
|
+
]
|
320
|
+
}
|
321
|
+
]
|
322
|
+
end
|
323
|
+
|
280
324
|
def parse_response_output(response)
|
281
325
|
# Extract content from Responses API format
|
282
326
|
output = response["output"]
|
283
|
-
return nil unless output
|
327
|
+
return nil unless output.is_a?(Array) && !output.empty?
|
284
328
|
|
285
329
|
# Look for message with text content
|
286
330
|
text_content = nil
|
@@ -289,14 +333,9 @@ module CvParser
|
|
289
333
|
if item.is_a?(Hash)
|
290
334
|
if item["type"] == "message" && item["content"]
|
291
335
|
item["content"].each do |content_item|
|
292
|
-
if content_item.is_a?(Hash)
|
293
|
-
|
294
|
-
|
295
|
-
break
|
296
|
-
elsif content_item["type"] == "output_text"
|
297
|
-
text_content = content_item["text"]
|
298
|
-
break
|
299
|
-
end
|
336
|
+
if content_item.is_a?(Hash) && %w[text output_text].include?(content_item["type"])
|
337
|
+
text_content = content_item["text"]
|
338
|
+
break
|
300
339
|
end
|
301
340
|
end
|
302
341
|
elsif item["type"] == "text" && item["text"]
|
data/lib/cv_parser/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cv-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gys Muller
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-08-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: base64
|