cv-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,395 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "uri"
5
+ require "json"
6
+ require "mime/types"
7
+ require "securerandom"
8
+ require "timeout"
9
+ require_relative "../pdf_converter"
10
+
11
+ module CvParser
12
+ module Providers
13
+ class OpenAI < Base
14
+ API_BASE_URL = "https://api.openai.com/v1"
15
+ API_FILE_URL = "https://api.openai.com/v1/files"
16
+ API_RESPONSES_URL = "https://api.openai.com/v1/responses"
17
+ DEFAULT_MODEL = "gpt-4.1-mini"
18
+
19
+ # HTTP Status codes
20
+ HTTP_OK = 200
21
+ HTTP_BAD_REQUEST = 400
22
+ HTTP_UNAUTHORIZED = 401
23
+ HTTP_TOO_MANY_REQUESTS = 429
24
+ HTTP_CLIENT_ERROR_START = 400
25
+ HTTP_CLIENT_ERROR_END = 499
26
+ HTTP_SERVER_ERROR_START = 500
27
+ HTTP_SERVER_ERROR_END = 599
28
+
29
+ # Constants
30
+ SCHEMA_NAME = "cv_data_extraction"
31
+ FILE_PURPOSE = "assistants"
32
+ MULTIPART_BOUNDARY_PREFIX = "----cv-parser-"
33
+ DEFAULT_MIME_TYPE = "application/octet-stream"
34
+
35
# Builds an OpenAI provider from +config+.
#
# Runs the parent initializer first, then caches the API key and timeout
# read from @config and stores the client settings built by #setup_client.
# The timeout falls back to 60 when the configured value is nil or false.
#
# @param config [Object] configuration responding to +api_key+ and +timeout+
#   NOTE(review): @config is presumably assigned by the parent initializer — confirm.
def initialize(config)
  super
  @api_key = @config.api_key
  @timeout = @config.timeout
  @timeout ||= 60
  @client = setup_client
end
41
+
42
# Extracts structured data from the CV at +file_path+ via the OpenAI Responses API.
#
# The inputs are validated, the file goes through #prepare_file, is uploaded,
# and is then referenced in a Responses API call constrained by +output_schema+.
# #cleanup_temp_file runs from an +ensure+ clause so that it is also invoked
# when the upload or the API call raises, not only on the success path.
#
# @param output_schema [Hash] schema whose "type"/:type is "json_schema"
# @param file_path [String, nil] path of the CV; required despite the nil default
# @return [Object, nil] parsed JSON from the model output, or nil when the
#   response carries no text output
# @raise [ArgumentError] when +file_path+ is missing or the schema format is rejected
# @raise [APIError] on timeouts and on HTTP errors
# @raise [RateLimitError] when an HTTP error message indicates rate limiting
# @raise [ParseError] when the response text is not valid JSON
def extract_data(output_schema:, file_path: nil)
  validate_inputs!(output_schema, file_path)

  processed_file_path = nil
  response =
    begin
      processed_file_path = prepare_file(file_path)
      file_id = upload_file(processed_file_path)
      create_response_with_file(file_id, output_schema)
    ensure
      # Nothing to clean up when prepare_file itself failed to produce a path.
      cleanup_temp_file(processed_file_path, file_path) if processed_file_path
    end

  parse_response_output(response)
rescue Timeout::Error => e
  raise APIError, "OpenAI API timeout: #{e.message}"
rescue Net::HTTPError => e
  handle_http_error(e)
rescue JSON::ParserError => e
  raise ParseError, "Failed to parse OpenAI response as JSON: #{e.message}"
end
59
+
60
# Uploads the file at +file_path+ to the OpenAI files endpoint as a
# multipart/form-data POST.
#
# @param file_path [String] path of the file to upload
# @return [Object] the "id" field of the JSON upload response
# @raise [APIError] when the upload fails for any reason. An APIError raised
#   while handling the response is re-raised untouched, so its message is not
#   prefixed twice and a subclass is not flattened.
#   NOTE(review): RateLimitError is only preserved here if it descends from
#   APIError — confirm against the errors file.
def upload_file(file_path)
  uri = URI(API_FILE_URL)
  file_content, mime_type, filename = prepare_file_upload_data(file_path)

  boundary = generate_boundary
  form_data = build_multipart_form_data(file_content, filename, mime_type, boundary)

  request = build_upload_request(uri, form_data, boundary)
  response = make_http_request(uri, request)

  handle_upload_response(response)
rescue APIError
  # Already carries upload context from handle_error_response; do not wrap again.
  raise
rescue StandardError => e
  raise APIError, "OpenAI API error during file upload: #{e.message}"
end
74
+
75
+ private
76
+
77
# Checks the arguments given to #extract_data before any work is done.
#
# @param output_schema [Object] schema to check with #validate_schema_format!
# @param file_path [String, nil] path that must be present, exist and be readable
# @raise [ArgumentError] when +file_path+ is nil or false
def validate_inputs!(output_schema, file_path)
  if file_path
    validate_schema_format!(output_schema)
    validate_file_exists!(file_path)
    validate_file_readable!(file_path)
  else
    raise ArgumentError, "File_path must be provided"
  end
end
84
+
85
# Rejects schemas that #valid_json_schema_format? does not accept.
#
# @param output_schema [Object] candidate schema
# @return [nil] when the schema is accepted
# @raise [ArgumentError] when the schema is not in the expected format
def validate_schema_format!(output_schema)
  unless valid_json_schema_format?(output_schema)
    raise ArgumentError, "The OpenAI provider requires a JSON Schema format with 'type: \"json_schema\"'"
  end
end
90
+
91
# Tells whether +schema+ is a Hash declaring type "json_schema" under either
# the string key "type" or the symbol key :type.
#
# @param schema [Object] candidate schema
# @return [Boolean]
def valid_json_schema_format?(schema)
  return false unless schema.is_a?(Hash)

  ["type", :type].any? { |key| schema.key?(key) && schema[key] == "json_schema" }
end
96
+
97
# Hands +file_path+ to #convert_to_pdf_if_needed and returns its result.
#
# @param file_path [String] path of the original file
# @return [Object] whatever #convert_to_pdf_if_needed returns
#   (presumably the path to upload — confirm against that helper)
def prepare_file(file_path)
  upload_ready_path = convert_to_pdf_if_needed(file_path)
  upload_ready_path
end
100
+
101
# Reads the file and gathers the values needed for a multipart upload.
#
# @param file_path [String] path of the file to upload
# @return [Array] three elements: the file bytes read in binary mode, the
#   MIME content type (DEFAULT_MIME_TYPE when none is detected), and the
#   file's base name
def prepare_file_upload_data(file_path)
  bytes = File.read(file_path, mode: "rb")
  detected_type = MIME::Types.type_for(file_path).first
  content_type = detected_type&.content_type || DEFAULT_MIME_TYPE

  [bytes, content_type, File.basename(file_path)]
end
108
+
109
# Produces a multipart boundary: MULTIPART_BOUNDARY_PREFIX followed by
# 32 random hexadecimal characters.
#
# @return [String]
def generate_boundary
  MULTIPART_BOUNDARY_PREFIX + SecureRandom.hex(16)
end
112
+
113
# Builds the multipart POST request for a file upload.
#
# The Content-Type header is set before the shared headers are copied in,
# so an entry in @base_headers with the same name takes precedence.
#
# @param uri [URI] upload endpoint
# @param form_data [String] encoded multipart body
# @param boundary [String] boundary used inside +form_data+
# @return [Net::HTTP::Post]
def build_upload_request(uri, form_data, boundary)
  Net::HTTP::Post.new(uri).tap do |upload|
    upload.body = form_data
    upload["Content-Type"] = "multipart/form-data; boundary=#{boundary}"
    @base_headers.each { |name, header_value| upload[name] = header_value }
  end
end
120
+
121
# Interprets the response to a file upload.
#
# @param response [#code, #body] HTTP response
# @return [Object] the "id" value from the JSON body when the status is HTTP_OK
# @raise whatever #handle_error_response raises for any other status
def handle_upload_response(response)
  return handle_error_response(response, "file upload") unless response.code.to_i == HTTP_OK

  JSON.parse(response.body)["id"]
end
129
+
130
# Collects the settings later used to issue HTTP requests.
#
# @return [Hash] with :http_class (Net::HTTP), :timeout (@timeout) and
#   :headers (@base_headers)
def setup_client
  %i[http_class timeout headers].zip([Net::HTTP, @timeout, @base_headers]).to_h
end
137
+
138
# Asks the Responses API to process an uploaded file against +schema+.
#
# @param file_id [Object] identifier returned by the file upload
# @param schema [Hash] caller-supplied output schema
# @return [Object] the result of #make_responses_api_request
def create_response_with_file(file_id, schema)
  make_responses_api_request(
    URI(API_RESPONSES_URL),
    build_response_payload(file_id, schema)
  )
end
143
+
144
# Assembles the request body for the Responses API call.
#
# @param file_id [Object] identifier of the uploaded file
# @param schema [Hash] caller-supplied output schema
# @return [Hash] payload with :model (the configured model or DEFAULT_MODEL),
#   :input, and :text carrying a "json_schema" format named SCHEMA_NAME
def build_response_payload(file_id, schema)
  model_name = @config.model || DEFAULT_MODEL
  user_input = build_file_input_for_responses_api(file_id)
  output_format = {
    type: "json_schema",
    name: SCHEMA_NAME,
    schema: schema_to_json_schema(schema)
  }

  { model: model_name, input: user_input, text: { format: output_format } }
end
157
+
158
# Converts the caller's schema into the object schema sent to the API.
#
# Every top-level property is listed as required and additional properties
# are disallowed.
#
# @param schema [Hash] schema holding a "properties"/:properties entry
# @return [Hash] with :type, :properties, :required and :additionalProperties
# @raise [ArgumentError] when the schema has no properties entry
def schema_to_json_schema(schema)
  strict_properties = ensure_additional_properties(extract_properties_from_schema(schema))

  json_schema = { type: "object", properties: strict_properties }
  json_schema[:required] = strict_properties.keys
  json_schema[:additionalProperties] = false
  json_schema
end
169
+
170
# Fetches the properties definition from +schema+, preferring the string key
# "properties" over the symbol key :properties.
#
# @param schema [Hash] caller-supplied schema
# @return [Object] the value stored under the first key that is present
# @raise [ArgumentError] when neither key is present
def extract_properties_from_schema(schema)
  present_key = ["properties", :properties].find { |candidate| schema.key?(candidate) }
  if present_key.nil?
    raise ArgumentError, "Invalid schema format. Please use JSON Schema format with 'type: \"json_schema\"'"
  end

  schema[present_key]
end
179
+
180
# Returns a new hash in which every property definition has been passed
# through #process_property_value; keys are kept as given.
#
# @param properties [Hash] property name => definition
# @return [Hash]
def ensure_additional_properties(properties)
  properties.each_with_object({}) do |(name, definition), processed|
    processed[name] = process_property_value(definition)
  end
end
187
+
188
# Dispatches a property definition to the handler for its declared type.
#
# Non-Hash values and Hash definitions whose type is neither "object" nor
# "array" are returned unchanged.
#
# @param value [Object] property definition
# @return [Object] the processed definition
def process_property_value(value)
  return value unless value.is_a?(Hash)

  declared_type = property_type(value)
  if declared_type == "object"
    process_object_property(value)
  elsif declared_type == "array"
    process_array_property(value)
  else
    value
  end
end
200
+
201
# Reads the declared type of a property definition, trying the string key
# "type" first and the symbol key :type second.
#
# @param value [Hash] property definition
# @return [Object, nil]
def property_type(value)
  string_keyed = value["type"]
  return string_keyed if string_keyed

  value[:type]
end
204
+
205
# Returns a copy of an object-typed property definition in which additional
# properties are disallowed, nested properties have been processed
# recursively, and every nested property is listed as required.
#
# Any existing "properties", "required" and "additionalProperties" entries
# are dropped (string- or symbol-keyed) before the replacements are merged in.
# Merging symbol keys into a string-keyed schema would otherwise keep both
# variants, which serialize to duplicate member names in the JSON payload.
#
# @param value [Hash] property definition whose type is "object"
# @return [Hash] new hash; +value+ itself is not modified
def process_object_property(value)
  nested_props = value["properties"] || value[:properties] || {}
  processed_nested_props = ensure_additional_properties(nested_props)

  overridden = %w[properties required additionalProperties]
  value
    .reject { |key, _| overridden.include?(key.to_s) }
    .merge(
      additionalProperties: false,
      properties: processed_nested_props,
      required: processed_nested_props.keys
    )
end
215
+
216
# Returns a copy of an array-typed property definition whose object-typed
# item schema disallows additional properties, has its nested properties
# processed recursively, and lists every nested property as required.
#
# Definitions whose items are missing, not a Hash, or not of type "object"
# are returned unchanged. Existing "items" (on the array) and "properties",
# "required", "additionalProperties" (on the items) are dropped in both
# string- and symbol-keyed form before the replacements are merged in, so the
# serialized JSON never carries duplicate member names.
#
# @param value [Hash] property definition whose type is "array"
# @return [Hash] +value+ itself when nothing needs rewriting, otherwise a new hash
def process_array_property(value)
  items = value["items"] || value[:items]
  return value unless items.is_a?(Hash) && property_type(items) == "object"

  nested_props = items["properties"] || items[:properties] || {}
  processed_nested_props = ensure_additional_properties(nested_props)

  overridden = %w[properties required additionalProperties]
  updated_items = items
    .reject { |key, _| overridden.include?(key.to_s) }
    .merge(
      additionalProperties: false,
      properties: processed_nested_props,
      required: processed_nested_props.keys
    )

  value.reject { |key, _| key.to_s == "items" }.merge(items: updated_items)
end
230
+
231
# Sends +payload+ as JSON to the Responses API and interprets the reply.
#
# @param uri [URI] Responses API endpoint
# @param payload [Hash] request body
# @return [Object] the result of #handle_responses_api_response
def make_responses_api_request(uri, payload)
  json_request = build_json_request(uri, payload)
  handle_responses_api_response(make_http_request(uri, json_request))
end
236
+
237
# Builds a JSON POST request.
#
# The Content-Type header is set before the shared headers are copied in,
# so an entry in @base_headers with the same name takes precedence.
#
# @param uri [URI] target endpoint
# @param payload [#to_json] request body
# @return [Net::HTTP::Post]
def build_json_request(uri, payload)
  Net::HTTP::Post.new(uri).tap do |post|
    post.body = payload.to_json
    post["Content-Type"] = "application/json"
    @base_headers.each { |name, header_value| post[name] = header_value }
  end
end
244
+
245
# Interprets a Responses API reply.
#
# @param response [#code, #body] HTTP response
# @return [Object] the parsed JSON body when the status is HTTP_OK
# @raise whatever #handle_error_response raises for any other status
def handle_responses_api_response(response)
  return JSON.parse(response.body) if response.code.to_i == HTTP_OK

  handle_error_response(response, "responses API")
end
252
+
253
# Performs +request+ over TLS against the host and port of +uri+.
#
# A new connection object is created from @client[:http_class] for every
# call; @client[:timeout] is applied as both the read and the open timeout.
#
# @param uri [URI] target endpoint
# @param request [Object] request to send
# @return [Object] whatever the connection's +request+ method returns
def make_http_request(uri, request)
  connection = @client[:http_class].new(uri.host, uri.port).tap do |http|
    http.use_ssl = true
    http.read_timeout = @client[:timeout]
    http.open_timeout = @client[:timeout]
  end

  connection.request(request)
end
261
+
262
# Builds the "input" section of a Responses API payload: a single user
# message holding the extraction prompt followed by a reference to the
# uploaded file.
#
# @param file_id [Object] identifier of the uploaded file
# @return [Array<Hash>]
def build_file_input_for_responses_api(file_id)
  prompt_part = { type: "input_text", text: build_extraction_prompt }
  file_part = { type: "input_file", file_id: file_id }

  [{ role: "user", content: [prompt_part, file_part] }]
end
279
+
280
# Extracts and parses the JSON document contained in a Responses API reply.
#
# The first output item that yields text wins. When that text is not valid
# JSON as a whole, the span from its first "{" to its last "}" is parsed
# instead.
#
# @param response [Hash, nil] decoded API response
# @return [Object, nil] parsed JSON, or nil when +response+ is not a Hash, has
#   no non-empty "output" array, or carries no text
# @raise [ParseError] when the text is not JSON and contains no {...} span
# @raise [JSON::ParserError] when the extracted {...} span is itself invalid
def parse_response_output(response)
  output = response.is_a?(Hash) ? response["output"] : nil
  return nil unless output.is_a?(Array) && !output.empty?

  text_content = nil
  output.each do |item|
    next unless item.is_a?(Hash)

    text_content = extract_output_text(item)
    break if text_content
  end
  return nil unless text_content

  begin
    JSON.parse(text_content)
  rescue JSON::ParserError => e
    # The model may wrap the JSON in prose; fall back to the outermost braces.
    json_text = text_content[/\{.*\}/m]
    raise ParseError, "Failed to parse OpenAI response as JSON: #{e.message}" unless json_text

    JSON.parse(json_text)
  end
end

# Returns the text carried by one output item, or nil when it has none.
# For a "message" item this is the "text" of the first content part typed
# "text" or "output_text"; for a bare "text" item it is the item's own "text".
def extract_output_text(item)
  if item["type"] == "message" && item["content"]
    parts = item["content"]
    # Guard against a content value that is not a list of parts.
    return nil unless parts.is_a?(Array)

    text_part = parts.find { |part| part.is_a?(Hash) && %w[text output_text].include?(part["type"]) }
    text_part && text_part["text"]
  elsif item["type"] == "text" && item["text"]
    item["text"]
  end
end
322
+
323
# Turns a non-success HTTP response into the matching exception.
#
# The message comes from the "error" => "message" entry of the parsed body,
# or "Unknown error" when that entry is absent.
#
# @param response [#code, #body] HTTP response
# @param context [String] short description of the failed operation
# @raise [RateLimitError] for status HTTP_TOO_MANY_REQUESTS
# @raise [APIError] for every other status
def handle_error_response(response, context)
  error_message = parse_error_body(response.body).dig("error", "message") || "Unknown error"
  status_code = response.code.to_i

  if status_code == HTTP_TOO_MANY_REQUESTS
    raise RateLimitError, "OpenAI rate limit exceeded during #{context}: #{error_message}"
  end

  category =
    if (HTTP_CLIENT_ERROR_START..HTTP_CLIENT_ERROR_END).cover?(status_code)
      "client error"
    elsif (HTTP_SERVER_ERROR_START..HTTP_SERVER_ERROR_END).cover?(status_code)
      "server error"
    else
      "error"
    end

  raise APIError, "OpenAI API #{category} during #{context} (#{status_code}): #{error_message}"
end
339
+
340
# Normalizes an error response body so that callers can use
# dig("error", "message") on the result.
#
# - a JSON object whose "error" is absent or a Hash is returned as parsed
# - a JSON object whose "error" is some other value gets that value's string
#   form wrapped as the message
# - any other JSON value, invalid JSON, or a nil body yields a hash whose
#   message is the raw body
#
# @param error_body [String, nil] raw response body
# @return [Hash]
def parse_error_body(error_body)
  # to_s turns a nil body into "", which raises ParserError and is handled below.
  parsed = JSON.parse(error_body.to_s)
  return { "error" => { "message" => error_body } } unless parsed.is_a?(Hash)

  error = parsed["error"]
  return parsed if error.nil? || error.is_a?(Hash)

  parsed.merge("error" => { "message" => error.to_s })
rescue JSON::ParserError
  { "error" => { "message" => error_body } }
end
345
+
346
# Re-raises an HTTP-level error as one of the gem's own exceptions.
#
# @param error [#message] the rescued error
# @raise [RateLimitError] when #rate_limit_error? recognizes the message
# @raise [APIError] otherwise
def handle_http_error(error)
  error_class, prefix =
    if rate_limit_error?(error)
      [RateLimitError, "OpenAI rate limit exceeded"]
    else
      [APIError, "OpenAI API error"]
    end

  raise error_class, "#{prefix}: #{error.message}"
end
351
+
352
# Tells whether an error message mentions rate limiting, i.e. contains the
# exact text "rate limit" or "429" (case-sensitive).
#
# @param error [#message]
# @return [Boolean]
def rate_limit_error?(error)
  ["rate limit", "429"].any? { |marker| error.message.include?(marker) }
end
355
+
356
# Concatenates the file part, the purpose part and the closing boundary into
# one multipart/form-data body.
#
# @param file_content [String] file bytes
# @param filename [String] name reported for the file part
# @param mime_type [String] content type of the file part
# @param boundary [String] multipart boundary
# @return [String]
def build_multipart_form_data(file_content, filename, mime_type, boundary)
  sections = [
    build_file_field(file_content, filename, mime_type, boundary),
    build_purpose_field(boundary),
    build_end_boundary(boundary)
  ]

  sections.reduce("") { |body, section| body + section }
end
363
+
364
# Builds the multipart part that carries the file itself.
#
# The part is assembled as a binary (ASCII-8BIT) string. Appending raw file
# bytes to a UTF-8 header that contains a non-ASCII filename (for example
# "résumé.pdf") raises Encoding::CompatibilityError, so the header is
# re-tagged as binary before the content is appended. Double quotes, CR and
# LF in the filename are percent-encoded so they cannot terminate the quoted
# value or inject header lines.
#
# @param file_content [String] file bytes
# @param filename [String] name reported in the Content-Disposition header
# @param mime_type [String] value of the part's Content-Type header
# @param boundary [String] multipart boundary
# @return [String] binary-encoded part, terminated by CRLF
def build_file_field(file_content, filename, mime_type, boundary)
  safe_filename = filename.to_s.gsub(/["\r\n]/) { |char| format("%%%02X", char.ord) }
  header = "--#{boundary}\r\n" \
           "Content-Disposition: form-data; name=\"file\"; filename=\"#{safe_filename}\"\r\n" \
           "Content-Type: #{mime_type}\r\n\r\n"

  field = String.new(encoding: Encoding::BINARY)
  field << header.b
  field << file_content.to_s.b
  field << "\r\n".b
  field
end
373
+
374
# Builds the multipart part that carries the "purpose" form field, whose
# value is FILE_PURPOSE.
#
# @param boundary [String] multipart boundary
# @return [String] the part, terminated by CRLF
def build_purpose_field(boundary)
  [
    "--#{boundary}\r\n",
    "Content-Disposition: form-data; name=\"purpose\"\r\n\r\n",
    FILE_PURPOSE,
    "\r\n"
  ].reduce("") { |field, piece| field + piece }
end
382
+
383
# Builds the closing delimiter of a multipart body: the boundary wrapped in
# double dashes, followed by CRLF.
#
# @param boundary [String] multipart boundary
# @return [String]
def build_end_boundary(boundary)
  format("--%s--\r\n", boundary)
end
386
+
387
+ protected
388
+
389
# Extends the parent's HTTP setup with the OpenAI bearer token.
#
# #initialize assigns @api_key only after calling +super+, so when this hook
# runs as part of the parent initializer @api_key is still nil and the header
# would be "Bearer " with no key. The key is therefore read from @config when
# @api_key has not been assigned yet.
# NOTE(review): assumes the parent sets @config before invoking this hook and
# that its setup_http_client initializes @base_headers — confirm in Base.
#
# @return [String] the Authorization header value that was stored
def setup_http_client
  super
  @base_headers["Authorization"] = "Bearer #{@api_key || @config&.api_key}"
end
393
+ end
394
+ end
395
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the cv-parser gem.
module CvParser
  # Version string of this gem release.
  VERSION = "0.1.0"
end
data/lib/cv_parser.rb ADDED
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "cv_parser/version"
4
+ require_relative "cv_parser/configuration"
5
+ require_relative "cv_parser/errors"
6
+ require_relative "cv_parser/providers/base"
7
+ require_relative "cv_parser/providers/openai"
8
+ require_relative "cv_parser/providers/anthropic"
9
+ require_relative "cv_parser/providers/faker"
10
+ require_relative "cv_parser/extractor"
11
+ require_relative "cv_parser/cli"
12
+
13
# Namespace of the gem and home of its process-wide configuration.
module CvParser
  # Error class declared at the top level of the namespace.
  class Error < StandardError; end

  # Returns the memoized configuration, creating it on first access.
  #
  # @return [Configuration]
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the current configuration to the block, when one is given, so the
  # caller can adjust it.
  #
  # @yieldparam config [Configuration] the current configuration
  # @return [Configuration] the configuration after the block has run
  def self.configure
    yield(configuration) if block_given?
    configuration
  end

  # Discards the stored configuration and replaces it with a fresh one.
  #
  # @return [Configuration] the new configuration
  def self.reset
    @configuration = Configuration.new
  end
end
metadata ADDED
@@ -0,0 +1,192 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cv-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Gys Muller
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-06-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.2'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.2'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faraday-multipart
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fiddle
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: json
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.6'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '2.6'
83
+ - !ruby/object:Gem::Dependency
84
+ name: mime-types
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.5'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.5'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rdoc
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '6.6'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '6.6'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rexml
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.2'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.2'
125
+ - !ruby/object:Gem::Dependency
126
+ name: zlib
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.0'
139
+ description: CV Parser is a Ruby gem that extracts structured information from CVs
140
+ and resumes in various formats using LLMs.
141
+ email:
142
+ - gysmuller@users.noreply.github.com
143
+ executables:
144
+ - cv-parser
145
+ extensions: []
146
+ extra_rdoc_files: []
147
+ files:
148
+ - CHANGELOG.md
149
+ - LICENSE.txt
150
+ - README.md
151
+ - bin/console
152
+ - bin/setup
153
+ - exe/cv-parser
154
+ - lib/cv_parser.rb
155
+ - lib/cv_parser/cli.rb
156
+ - lib/cv_parser/configuration.rb
157
+ - lib/cv_parser/errors.rb
158
+ - lib/cv_parser/extractor.rb
159
+ - lib/cv_parser/pdf_converter.rb
160
+ - lib/cv_parser/providers/anthropic.rb
161
+ - lib/cv_parser/providers/base.rb
162
+ - lib/cv_parser/providers/faker.rb
163
+ - lib/cv_parser/providers/openai.rb
164
+ - lib/cv_parser/version.rb
165
+ homepage: https://github.com/gysmuller/cv-parser
166
+ licenses:
167
+ - MIT
168
+ metadata:
169
+ homepage_uri: https://github.com/gysmuller/cv-parser
170
+ source_code_uri: https://github.com/gysmuller/cv-parser
171
+ changelog_uri: https://github.com/gysmuller/cv-parser/blob/main/CHANGELOG.md
172
+ rubygems_mfa_required: 'true'
173
+ post_install_message:
174
+ rdoc_options: []
175
+ require_paths:
176
+ - lib
177
+ required_ruby_version: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - ">="
180
+ - !ruby/object:Gem::Version
181
+ version: 2.6.0
182
+ required_rubygems_version: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
187
+ requirements: []
188
+ rubygems_version: 3.5.16
189
+ signing_key:
190
+ specification_version: 4
191
+ summary: A Ruby gem for parsing CVs/resumes using LLMs
192
+ test_files: []