cv-parser 0.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package content as it appears in the respective public registries.
@@ -0,0 +1,249 @@
+# frozen_string_literal: true
+
+require "faraday"
+require "json"
+require "mime/types"
+require "base64"
+require "faraday/multipart"
+require_relative "../pdf_converter"
+require "securerandom"
+
+module CvParser
+  module Providers
+    class Anthropic < Base
+      ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
+      ANTHROPIC_API_VERSION = "2023-06-01"
+      DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
+      TOOL_NAME = "extract_cv_data"
+
+      # HTTP Status codes
+      HTTP_OK = 200
+      HTTP_BAD_REQUEST = 400
+      HTTP_UNAUTHORIZED = 401
+      HTTP_TOO_MANY_REQUESTS = 429
+
+      def initialize(config)
+        super
+        @client = setup_client
+      end
+
+      def extract_data(output_schema:, file_path: nil)
+        validate_inputs!(output_schema, file_path)
+
+        processed_file_path = prepare_file(file_path)
+        base64_content = encode_file_to_base64(processed_file_path)
+
+        response = make_api_request(output_schema, base64_content)
+
+        cleanup_temp_file(processed_file_path, file_path)
+
+        handle_tool_response(response, output_schema)
+      rescue Faraday::Error => e
+        raise APIError, "Anthropic API connection error: #{e.message}"
+      end
+
+      private
+
+      def validate_inputs!(output_schema, file_path)
+        raise ArgumentError, "File_path must be provided" unless file_path
+
+        validate_schema_format!(output_schema)
+        validate_file_exists!(file_path)
+        validate_file_readable!(file_path)
+      end
+
+      def validate_schema_format!(output_schema)
+        return if valid_json_schema_format?(output_schema)
+
+        raise ArgumentError, "The Anthropic provider requires a JSON Schema format with 'type: \"json_schema\"'"
+      end
+
+      def valid_json_schema_format?(schema)
+        schema.is_a?(Hash) &&
+          ((schema.key?("type") && schema["type"] == "json_schema") ||
+           (schema.key?(:type) && schema[:type] == "json_schema"))
+      end
+
+      def prepare_file(file_path)
+        convert_to_pdf_if_needed(file_path)
+      end
+
+      def encode_file_to_base64(file_path)
+        pdf_content = File.read(file_path)
+        Base64.strict_encode64(pdf_content)
+      end
+
+      def make_api_request(output_schema, base64_content)
+        extraction_tool = build_extraction_tool(output_schema)
+
+        @client.post do |req|
+          req.headers["Content-Type"] = "application/json"
+          req.body = build_request_body(output_schema, extraction_tool, base64_content).to_json
+        end
+      end
+
+      def build_request_body(output_schema, extraction_tool, base64_content)
+        {
+          model: @config.model || DEFAULT_MODEL,
+          max_tokens: @config.max_tokens,
+          temperature: @config.temperature,
+          system: build_system_prompt,
+          tools: [extraction_tool],
+          tool_choice: { type: "tool", name: TOOL_NAME },
+          messages: [build_message(output_schema, base64_content)]
+        }
+      end
+
+      def build_message(output_schema, base64_content)
+        {
+          role: "user",
+          content: [
+            build_document_content(base64_content),
+            build_text_content(output_schema)
+          ]
+        }
+      end
+
+      def build_document_content(base64_content)
+        {
+          type: "document",
+          source: {
+            type: "base64",
+            media_type: "application/pdf",
+            data: base64_content
+          }
+        }
+      end
+
+      def build_text_content(output_schema)
+        {
+          type: "text",
+          text: build_extraction_prompt(output_schema)
+        }
+      end
+
+      def build_extraction_tool(output_schema)
+        json_schema = normalize_schema_to_json_schema(output_schema)
+
+        {
+          name: TOOL_NAME,
+          description: "Extract structured data from a CV/resume document according to the provided schema. Always use this tool to return the extracted data in the exact format specified by the schema.",
+          input_schema: json_schema
+        }
+      end
+
+      def normalize_schema_to_json_schema(schema)
+        # Extract the properties from the JSON Schema format
+        properties = extract_properties_from_schema(schema)
+        required = extract_required_from_schema(schema)
+
+        {
+          type: "object",
+          properties: properties,
+          required: required
+        }
+      end
+
+      def extract_properties_from_schema(schema)
+        if schema.key?("properties")
+          schema["properties"]
+        elsif schema.key?(:properties)
+          schema[:properties]
+        else
+          {}
+        end
+      end
+
+      def extract_required_from_schema(schema)
+        if schema.key?("required")
+          schema["required"]
+        elsif schema.key?(:required)
+          schema[:required]
+        else
+          []
+        end
+      end
+
+      def handle_tool_response(response, _schema)
+        case response.status
+        when HTTP_OK
+          extract_tool_data_from_response(response)
+        when HTTP_TOO_MANY_REQUESTS
+          handle_rate_limit_error(response)
+        when HTTP_UNAUTHORIZED
+          raise AuthenticationError, "Invalid API key or unauthorized access"
+        when HTTP_BAD_REQUEST
+          handle_bad_request_error(response)
+        else
+          handle_generic_api_error(response)
+        end
+      rescue JSON::ParserError => e
+        raise ParseError, "Failed to parse Anthropic response as JSON: #{e.message}"
+      end
+
+      def extract_tool_data_from_response(response)
+        response_body = response.body
+        content = response_body["content"]
+
+        raise ParseError, "Unexpected Anthropic response format: content is not an array" unless content.is_a?(Array)
+
+        tool_use_block = find_tool_use_block(content)
+        validate_tool_use_block(tool_use_block)
+
+        extracted_data = tool_use_block["input"]
+        raise ParseError, "Tool input is not a hash/object as expected" unless extracted_data.is_a?(Hash)
+
+        extracted_data
+      end
+
+      def find_tool_use_block(content)
+        content.find { |block| block["type"] == "tool_use" }
+      end
+
+      def validate_tool_use_block(tool_use_block)
+        raise ParseError, "No tool_use block found in Claude's response" unless tool_use_block
+
+        return if tool_use_block["name"] == TOOL_NAME
+
+        raise ParseError, "Unexpected tool used: #{tool_use_block["name"]}"
+      end
+
+      def handle_rate_limit_error(response)
+        retry_after = response.headers["retry-after"]
+        message = retry_after ? "Rate limit exceeded, retry after #{retry_after} seconds" : "Rate limit exceeded"
+        raise RateLimitError, message
+      end
+
+      def handle_bad_request_error(response)
+        error_message = response.body.dig("error", "message") || "Bad request"
+        raise InvalidRequestError, "Anthropic API error: #{error_message}"
+      end
+
+      def handle_generic_api_error(response)
+        error_type = response.body.dig("error", "type") || "unknown"
+        error_message = response.body.dig("error", "message") || "Unknown error"
+        raise APIError, "Anthropic API error: #{response.status} - #{error_type} - #{error_message}"
+      end
+
+      protected
+
+      # Sets up the Faraday HTTP client with proper headers and configuration
+      def setup_http_client
+        super
+        @base_headers["x-api-key"] = @api_key
+        @base_headers["anthropic-version"] = ANTHROPIC_API_VERSION
+      end
+
+      # Configures and returns a Faraday client instance
+      def setup_client
+        Faraday.new(url: ANTHROPIC_API_URL) do |f|
+          f.options.timeout = @timeout
+          f.request :json
+          f.response :json
+          f.adapter Faraday.default_adapter
+          @base_headers.each { |key, value| f.headers[key] = value }
+        end
+      end
+    end
+  end
+end
@@ -0,0 +1,119 @@
+# frozen_string_literal: true
+
+require "securerandom"
+require_relative "../pdf_converter"
+
+module CvParser
+  module Providers
+    # Base class for CV parsing providers that defines the common interface
+    # and shared functionality for extracting structured data from CV files.
+    class Base
+      def initialize(config)
+        @config = config
+        @pdf_converter = CvParser::PdfConverter.new
+        setup_http_client
+      end
+
+      def extract_data(output_schema:, file_path: nil)
+        raise NotImplementedError, "Subclasses must implement extract_data"
+      end
+
+      def upload_file(file_path)
+        raise NotImplementedError, "Subclasses must implement upload_file"
+      end
+
+      protected
+
+      def setup_http_client
+        @api_key = @config.api_key
+        @timeout = @config.timeout || 60
+        @base_headers = {
+          "User-Agent" => "cv-parser-ruby/#{CvParser::VERSION}",
+          **@config.provider_options.fetch(:headers, {})
+        }
+      end
+
+      def convert_to_pdf_if_needed(file_path)
+        file_ext = File.extname(file_path).downcase
+
+        case file_ext
+        when ".docx"
+          # Generate a temporary PDF file path
+          temp_pdf_path = File.join(
+            File.dirname(file_path),
+            "#{File.basename(file_path, file_ext)}_converted_#{SecureRandom.hex(8)}.pdf"
+          )
+
+          # Convert DOCX to PDF
+          @pdf_converter.convert(file_path, temp_pdf_path)
+          temp_pdf_path
+        when ".pdf"
+          # Already a PDF, return as-is
+          file_path
+        else
+          # For other file types, let the provider handle them directly
+          file_path
+        end
+      rescue StandardError => e
+        raise APIError, "Failed to convert DOCX to PDF: #{e.message}"
+      end
+
+      def cleanup_temp_file(processed_file_path, original_file_path)
+        # Only delete if we created a temporary converted file
+        if processed_file_path != original_file_path && File.exist?(processed_file_path)
+          File.delete(processed_file_path)
+        end
+      rescue StandardError => e
+        # Log the error but don't fail the main operation
+        warn "Warning: Failed to cleanup temporary file #{processed_file_path}: #{e.message}"
+      end
+
+      def build_extraction_prompt(schema = nil)
+        default_prompt = <<~PROMPT
+          Extract structured information from the attached CV/Resume as JSON.
+
+          Instructions:
+          1. Extract all the requested fields from the CV.
+          2. Maintain the exact structure defined in the schema.
+          3. If information for a field is not available, use null or empty arrays as appropriate.
+          4. For dates, use the format provided in the CV.
+          5. Return only raw JSON without any markdown formatting, code blocks, or additional explanations.
+          6. Do not prefix your response with ```json or any other markdown syntax.
+          7. Start your response with the opening curly brace { and end with the closing curly brace }.
+        PROMPT
+
+        prompt = @config.prompt || default_prompt
+
+        if schema
+          prompt += <<~SCHEMA
+
+            The output should be formatted as JSON with the following schema:
+            #{schema.to_json}
+          SCHEMA
+        end
+
+        prompt
+      end
+
+      def build_system_prompt
+        return @config.system_prompt if @config.system_prompt
+
+        <<~PROMPT
+          You are a CV parsing assistant. Extract structured information from the attached CV/Resume.
+        PROMPT
+      end
+
+      def validate_file_exists!(file_path)
+        return if File.exist?(file_path)
+
+        raise FileNotFoundError, "File not found: #{file_path}"
+      end
+
+      def validate_file_readable!(file_path)
+        return if File.readable?(file_path)
+
+        raise FileNotReadableError, "File not readable: #{file_path}"
+      end
+    end
+  end
+end
@@ -0,0 +1,215 @@
+# frozen_string_literal: true
+
+require "securerandom"
+
+module CvParser
+  module Providers
+    class Faker < Base
+      # Sample data constants
+      SKILLS = ["Ruby", "JavaScript", "Python", "React", "Java", "C#", "PHP", "Go", "Swift", "Kotlin"].freeze
+      JOB_TITLES = ["Software Engineer", "Full Stack Developer", "DevOps Engineer", "Data Scientist",
+                    "Product Manager", "UX Designer", "Frontend Developer", "Backend Developer"].freeze
+      COMPANIES = %w[Google Microsoft Amazon Facebook Apple Netflix Tesla Airbnb].freeze
+      UNIVERSITIES = ["Stanford University", "MIT", "Harvard", "Berkeley", "Oxford", "Cambridge"].freeze
+      DEGREES = ["Bachelor of Science", "Master of Science", "PhD", "MBA"].freeze
+      MAJORS = ["Computer Science", "Software Engineering", "Electrical Engineering", "Data Science"].freeze
+      LANGUAGES = %w[English Spanish French German Mandarin].freeze
+
+      # Date ranges
+      START_YEAR_RANGE = (2010..2020).freeze
+      END_YEAR_RANGE = (2015..2022).freeze
+      GENERAL_YEAR_RANGE = (2010..2023).freeze
+
+      # Array size range
+      ARRAY_SIZE_RANGE = (1..3).freeze
+
+      # Schema types
+      JSON_SCHEMA_TYPE = "json_schema"
+
+      def extract_data(output_schema:, file_path: nil)
+        validate_schema_format!(output_schema)
+        generate_fake_data(output_schema)
+      end
+
+      def upload_file(file_path)
+        # No-op for faker provider
+        { id: "fake-file-#{SecureRandom.hex(8)}" }
+      end
+
+      private
+
+      def validate_schema_format!(output_schema)
+        return if valid_json_schema_format?(output_schema)
+
+        raise ArgumentError, "The Faker provider requires a JSON Schema format with 'type: \"json_schema\"'"
+      end
+
+      def valid_json_schema_format?(schema)
+        schema.is_a?(Hash) &&
+          ((schema.key?("type") && schema["type"] == JSON_SCHEMA_TYPE) ||
+           (schema.key?(:type) && schema[:type] == JSON_SCHEMA_TYPE))
+      end
+
+      def generate_fake_data(schema)
+        return generate_fake_data_from_json_schema(schema) if json_schema_format?(schema)
+        return generate_fake_data_from_hash(schema) if schema.is_a?(Hash)
+        return generate_fake_data_from_array(schema) if schema.is_a?(Array) && !schema.empty?
+
+        "fake-value"
+      end
+
+      def json_schema_format?(schema)
+        schema.is_a?(Hash) &&
+          ((schema.key?("type") && schema["type"] == JSON_SCHEMA_TYPE) ||
+           (schema.key?(:type) && schema[:type] == JSON_SCHEMA_TYPE))
+      end
+
+      def generate_fake_data_from_json_schema(schema)
+        properties = extract_properties_from_schema(schema)
+        generate_fake_data_from_properties(properties)
+      end
+
+      def extract_properties_from_schema(schema)
+        schema["properties"] || schema[:properties] || {}
+      end
+
+      def generate_fake_data_from_hash(schema)
+        result = {}
+        schema.each do |key, type|
+          result[key.to_s] = generate_value_for_type(type, key)
+        end
+        result
+      end
+
+      def generate_fake_data_from_array(schema)
+        count = rand(ARRAY_SIZE_RANGE)
+        Array.new(count) { generate_fake_data(schema.first) }
+      end
+
+      def generate_fake_data_from_properties(properties)
+        result = {}
+        properties.each do |key, type|
+          result[key.to_s] = generate_value_for_type(type, key)
+        end
+        result
+      end
+
+      def generate_value_for_type(type, key)
+        return generate_value_from_typed_hash(type, key) if typed_hash?(type)
+        return generate_fake_data(type) if type.is_a?(Hash)
+        return generate_fake_data_from_array(type) if type.is_a?(Array)
+
+        generate_string_value(key, nil)
+      end
+
+      def typed_hash?(type)
+        type.is_a?(Hash) && (type.key?("type") || type.key?(:type))
+      end
+
+      def generate_value_from_typed_hash(type, key)
+        schema_type = type["type"] || type[:type]
+        description = type["description"] || type[:description]
+
+        case schema_type
+        when "object"
+          properties = type["properties"] || type[:properties] || {}
+          generate_fake_data(properties)
+        when "array"
+          items = type["items"] || type[:items]
+          count = rand(ARRAY_SIZE_RANGE)
+          Array.new(count) { generate_value_for_type(items, key) }
+        when "string"
+          generate_string_value(key, description)
+        when "number", "integer"
+          rand(1..100)
+        when "boolean"
+          [true, false].sample
+        else
+          "fake-value"
+        end
+      end
+
+      def generate_string_value(key, description = nil)
+        key_string = key.to_s.downcase
+
+        case key_string
+        when /name/
+          generate_name_value
+        when /email/
+          generate_email_value
+        when /phone/
+          generate_phone_value
+        when /address/
+          generate_address_value
+        when /summary/, /objective/, /description/
+          generate_description_value
+        when /skill/
+          SKILLS.sample
+        when /title/, /position/, /role/
+          JOB_TITLES.sample
+        when /company/, /employer/, /organization/
+          COMPANIES.sample
+        when /university/, /school/, /college/
+          UNIVERSITIES.sample
+        when /degree/
+          DEGREES.sample
+        when /major/, /field/
+          MAJORS.sample
+        when /year/, /years/
+          generate_years_value
+        when /start_date/, /start/
+          generate_date_value(START_YEAR_RANGE)
+        when /end_date/, /end/
+          generate_date_value(END_YEAR_RANGE)
+        when /date/
+          generate_date_value(GENERAL_YEAR_RANGE)
+        when /url/, /website/, /link/
+          generate_url_value
+        when /language/
+          LANGUAGES.sample
+        else
+          generate_default_value(key)
+        end
+      end
+
+      def generate_name_value
+        "John Doe"
+      end
+
+      def generate_email_value
+        "john.doe@example.com"
+      end
+
+      def generate_phone_value
+        "+1 (555) 123-4567"
+      end
+
+      def generate_address_value
+        "123 Main St, Anytown, CA 94088"
+      end
+
+      def generate_description_value
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+      end
+
+      def generate_years_value
+        rand(1..10).to_s
+      end
+
+      def generate_date_value(year_range)
+        year = rand(year_range)
+        month = format("%02d", rand(1..12))
+        day = format("%02d", rand(1..28))
+        "#{year}-#{month}-#{day}"
+      end
+
+      def generate_url_value
+        "https://www.example.com"
+      end
+
+      def generate_default_value(key)
+        "fake-#{key}-#{SecureRandom.hex(4)}"
+      end
+    end
+  end
+end
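
For orientation only, below is a minimal sketch of how the Anthropic provider added in this release might be driven, based solely on the interfaces visible in the diff above. The require path, the Struct used as a stand-in for the gem's real configuration object, and the sample schema fields are assumptions for illustration, not part of the released package.

    require "cv_parser" # assumed require path for the cv-parser gem

    # Hypothetical stand-in for the gem's configuration object; the providers above
    # only read these attributes (see Base#setup_http_client and the Anthropic class).
    FakeConfig = Struct.new(:api_key, :model, :max_tokens, :temperature, :timeout,
                            :provider_options, :prompt, :system_prompt, keyword_init: true)

    config = FakeConfig.new(
      api_key: ENV["ANTHROPIC_API_KEY"],
      max_tokens: 4096,      # passed straight through to the Messages API request body
      temperature: 0,
      provider_options: {}   # Base#setup_http_client reads :headers from this hash
    )

    # Both the Anthropic and Faker providers require a hash with type: "json_schema".
    schema = {
      type: "json_schema",
      properties: {
        name: { type: "string" },
        email: { type: "string" },
        skills: { type: "array", items: { type: "string" } }
      },
      required: ["name"]
    }

    provider = CvParser::Providers::Anthropic.new(config)
    data = provider.extract_data(output_schema: schema, file_path: "resume.pdf")
    puts data["skills"]

Swapping in CvParser::Providers::Faker.new(config) returns schema-shaped sample data without any network call, which is how the extraction path can be exercised in tests.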