cv-parser 0.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package content as it appears in the respective public registries.
@@ -0,0 +1,249 @@
+# frozen_string_literal: true
+
+require "faraday"
+require "json"
+require "mime/types"
+require "base64"
+require "faraday/multipart"
+require_relative "../pdf_converter"
+require "securerandom"
+
+module CvParser
+  module Providers
+    class Anthropic < Base
+      ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
+      ANTHROPIC_API_VERSION = "2023-06-01"
+      DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
+      TOOL_NAME = "extract_cv_data"
+
+      # HTTP Status codes
+      HTTP_OK = 200
+      HTTP_BAD_REQUEST = 400
+      HTTP_UNAUTHORIZED = 401
+      HTTP_TOO_MANY_REQUESTS = 429
+
+      def initialize(config)
+        super
+        @client = setup_client
+      end
+
+      def extract_data(output_schema:, file_path: nil)
+        validate_inputs!(output_schema, file_path)
+
+        processed_file_path = prepare_file(file_path)
+        base64_content = encode_file_to_base64(processed_file_path)
+
+        response = make_api_request(output_schema, base64_content)
+
+        cleanup_temp_file(processed_file_path, file_path)
+
+        handle_tool_response(response, output_schema)
+      rescue Faraday::Error => e
+        raise APIError, "Anthropic API connection error: #{e.message}"
+      end
+
+      private
+
+      def validate_inputs!(output_schema, file_path)
+        raise ArgumentError, "File_path must be provided" unless file_path
+
+        validate_schema_format!(output_schema)
+        validate_file_exists!(file_path)
+        validate_file_readable!(file_path)
+      end
+
+      def validate_schema_format!(output_schema)
+        return if valid_json_schema_format?(output_schema)
+
+        raise ArgumentError, "The Anthropic provider requires a JSON Schema format with 'type: \"json_schema\"'"
+      end
+
+      def valid_json_schema_format?(schema)
+        schema.is_a?(Hash) &&
+          ((schema.key?("type") && schema["type"] == "json_schema") ||
+           (schema.key?(:type) && schema[:type] == "json_schema"))
+      end
+
+      def prepare_file(file_path)
+        convert_to_pdf_if_needed(file_path)
+      end
+
+      def encode_file_to_base64(file_path)
+        pdf_content = File.read(file_path)
+        Base64.strict_encode64(pdf_content)
+      end
+
+      def make_api_request(output_schema, base64_content)
+        extraction_tool = build_extraction_tool(output_schema)
+
+        @client.post do |req|
+          req.headers["Content-Type"] = "application/json"
+          req.body = build_request_body(output_schema, extraction_tool, base64_content).to_json
+        end
+      end
+
+      def build_request_body(output_schema, extraction_tool, base64_content)
+        {
+          model: @config.model || DEFAULT_MODEL,
+          max_tokens: @config.max_tokens,
+          temperature: @config.temperature,
+          system: build_system_prompt,
+          tools: [extraction_tool],
+          tool_choice: { type: "tool", name: TOOL_NAME },
+          messages: [build_message(output_schema, base64_content)]
+        }
+      end
+
+      def build_message(output_schema, base64_content)
+        {
+          role: "user",
+          content: [
+            build_document_content(base64_content),
+            build_text_content(output_schema)
+          ]
+        }
+      end
+
+      def build_document_content(base64_content)
+        {
+          type: "document",
+          source: {
+            type: "base64",
+            media_type: "application/pdf",
+            data: base64_content
+          }
+        }
+      end
+
+      def build_text_content(output_schema)
+        {
+          type: "text",
+          text: build_extraction_prompt(output_schema)
+        }
+      end
+
+      def build_extraction_tool(output_schema)
+        json_schema = normalize_schema_to_json_schema(output_schema)
+
+        {
+          name: TOOL_NAME,
+          description: "Extract structured data from a CV/resume document according to the provided schema. Always use this tool to return the extracted data in the exact format specified by the schema.",
+          input_schema: json_schema
+        }
+      end
+
+      def normalize_schema_to_json_schema(schema)
+        # Extract the properties from the JSON Schema format
+        properties = extract_properties_from_schema(schema)
+        required = extract_required_from_schema(schema)
+
+        {
+          type: "object",
+          properties: properties,
+          required: required
+        }
+      end
+
+      def extract_properties_from_schema(schema)
+        if schema.key?("properties")
+          schema["properties"]
+        elsif schema.key?(:properties)
+          schema[:properties]
+        else
+          {}
+        end
+      end
+
+      def extract_required_from_schema(schema)
+        if schema.key?("required")
+          schema["required"]
+        elsif schema.key?(:required)
+          schema[:required]
+        else
+          []
+        end
+      end
+
+      def handle_tool_response(response, _schema)
+        case response.status
+        when HTTP_OK
+          extract_tool_data_from_response(response)
+        when HTTP_TOO_MANY_REQUESTS
+          handle_rate_limit_error(response)
+        when HTTP_UNAUTHORIZED
+          raise AuthenticationError, "Invalid API key or unauthorized access"
+        when HTTP_BAD_REQUEST
+          handle_bad_request_error(response)
+        else
+          handle_generic_api_error(response)
+        end
+      rescue JSON::ParserError => e
+        raise ParseError, "Failed to parse Anthropic response as JSON: #{e.message}"
+      end
+
+      def extract_tool_data_from_response(response)
+        response_body = response.body
+        content = response_body["content"]
+
+        raise ParseError, "Unexpected Anthropic response format: content is not an array" unless content.is_a?(Array)
+
+        tool_use_block = find_tool_use_block(content)
+        validate_tool_use_block(tool_use_block)
+
+        extracted_data = tool_use_block["input"]
+        raise ParseError, "Tool input is not a hash/object as expected" unless extracted_data.is_a?(Hash)
+
+        extracted_data
+      end
+
+      def find_tool_use_block(content)
+        content.find { |block| block["type"] == "tool_use" }
+      end
+
+      def validate_tool_use_block(tool_use_block)
+        raise ParseError, "No tool_use block found in Claude's response" unless tool_use_block
+
+        return if tool_use_block["name"] == TOOL_NAME
+
+        raise ParseError, "Unexpected tool used: #{tool_use_block["name"]}"
+      end
+
+      def handle_rate_limit_error(response)
+        retry_after = response.headers["retry-after"]
+        message = retry_after ? "Rate limit exceeded, retry after #{retry_after} seconds" : "Rate limit exceeded"
+        raise RateLimitError, message
+      end
+
+      def handle_bad_request_error(response)
+        error_message = response.body.dig("error", "message") || "Bad request"
+        raise InvalidRequestError, "Anthropic API error: #{error_message}"
+      end
+
+      def handle_generic_api_error(response)
+        error_type = response.body.dig("error", "type") || "unknown"
+        error_message = response.body.dig("error", "message") || "Unknown error"
+        raise APIError, "Anthropic API error: #{response.status} - #{error_type} - #{error_message}"
+      end
+
+      protected
+
+      # Sets up the Faraday HTTP client with proper headers and configuration
+      def setup_http_client
+        super
+        @base_headers["x-api-key"] = @api_key
+        @base_headers["anthropic-version"] = ANTHROPIC_API_VERSION
+      end
+
+      # Configures and returns a Faraday client instance
+      def setup_client
+        Faraday.new(url: ANTHROPIC_API_URL) do |f|
+          f.options.timeout = @timeout
+          f.request :json
+          f.response :json
+          f.adapter Faraday.default_adapter
+          @base_headers.each { |key, value| f.headers[key] = value }
+        end
+      end
+    end
+  end
+end
@@ -0,0 +1,119 @@
+# frozen_string_literal: true
+
+require "securerandom"
+require_relative "../pdf_converter"
+
+module CvParser
+  module Providers
+    # Base class for CV parsing providers that defines the common interface
+    # and shared functionality for extracting structured data from CV files.
+    class Base
+      def initialize(config)
+        @config = config
+        @pdf_converter = CvParser::PdfConverter.new
+        setup_http_client
+      end
+
+      def extract_data(output_schema:, file_path: nil)
+        raise NotImplementedError, "Subclasses must implement extract_data"
+      end
+
+      def upload_file(file_path)
+        raise NotImplementedError, "Subclasses must implement upload_file"
+      end
+
+      protected
+
+      def setup_http_client
+        @api_key = @config.api_key
+        @timeout = @config.timeout || 60
+        @base_headers = {
+          "User-Agent" => "cv-parser-ruby/#{CvParser::VERSION}",
+          **@config.provider_options.fetch(:headers, {})
+        }
+      end
+
+      def convert_to_pdf_if_needed(file_path)
+        file_ext = File.extname(file_path).downcase
+
+        case file_ext
+        when ".docx"
+          # Generate a temporary PDF file path
+          temp_pdf_path = File.join(
+            File.dirname(file_path),
+            "#{File.basename(file_path, file_ext)}_converted_#{SecureRandom.hex(8)}.pdf"
+          )
+
+          # Convert DOCX to PDF
+          @pdf_converter.convert(file_path, temp_pdf_path)
+          temp_pdf_path
+        when ".pdf"
+          # Already a PDF, return as-is
+          file_path
+        else
+          # For other file types, let the provider handle them directly
+          file_path
+        end
+      rescue StandardError => e
+        raise APIError, "Failed to convert DOCX to PDF: #{e.message}"
+      end
+
+      def cleanup_temp_file(processed_file_path, original_file_path)
+        # Only delete if we created a temporary converted file
+        if processed_file_path != original_file_path && File.exist?(processed_file_path)
+          File.delete(processed_file_path)
+        end
+      rescue StandardError => e
+        # Log the error but don't fail the main operation
+        warn "Warning: Failed to cleanup temporary file #{processed_file_path}: #{e.message}"
+      end
+
+      def build_extraction_prompt(schema = nil)
+        default_prompt = <<~PROMPT
+          Extract structured information from the attached CV/Resume as JSON.
+
+          Instructions:
+          1. Extract all the requested fields from the CV.
+          2. Maintain the exact structure defined in the schema.
+          3. If information for a field is not available, use null or empty arrays as appropriate.
+          4. For dates, use the format provided in the CV.
+          5. Return only raw JSON without any markdown formatting, code blocks, or additional explanations.
+          6. Do not prefix your response with ```json or any other markdown syntax.
+          7. Start your response with the opening curly brace { and end with the closing curly brace }.
+        PROMPT
+
+        prompt = @config.prompt || default_prompt
+
+        if schema
+          prompt += <<~SCHEMA
+
+            The output should be formatted as JSON with the following schema:
+            #{schema.to_json}
+          SCHEMA
+        end
+
+        prompt
+      end
+
+      def build_system_prompt
+        return @config.system_prompt if @config.system_prompt
+
+        <<~PROMPT
+          You are a CV parsing assistant. Extract structured information from the attached CV/Resume.
+        PROMPT
+      end
+
+      def validate_file_exists!(file_path)
+        return if File.exist?(file_path)
+
+        raise FileNotFoundError, "File not found: #{file_path}"
+      end
+
+      def validate_file_readable!(file_path)
+        return if File.readable?(file_path)
+
+        raise FileNotReadableError, "File not readable: #{file_path}"
+      end
+    end
+  end
+end
@@ -0,0 +1,215 @@
+# frozen_string_literal: true
+
+require "securerandom"
+
+module CvParser
+  module Providers
+    class Faker < Base
+      # Sample data constants
+      SKILLS = ["Ruby", "JavaScript", "Python", "React", "Java", "C#", "PHP", "Go", "Swift", "Kotlin"].freeze
+      JOB_TITLES = ["Software Engineer", "Full Stack Developer", "DevOps Engineer", "Data Scientist",
+                    "Product Manager", "UX Designer", "Frontend Developer", "Backend Developer"].freeze
+      COMPANIES = %w[Google Microsoft Amazon Facebook Apple Netflix Tesla Airbnb].freeze
+      UNIVERSITIES = ["Stanford University", "MIT", "Harvard", "Berkeley", "Oxford", "Cambridge"].freeze
+      DEGREES = ["Bachelor of Science", "Master of Science", "PhD", "MBA"].freeze
+      MAJORS = ["Computer Science", "Software Engineering", "Electrical Engineering", "Data Science"].freeze
+      LANGUAGES = %w[English Spanish French German Mandarin].freeze
+
+      # Date ranges
+      START_YEAR_RANGE = (2010..2020).freeze
+      END_YEAR_RANGE = (2015..2022).freeze
+      GENERAL_YEAR_RANGE = (2010..2023).freeze
+
+      # Array size range
+      ARRAY_SIZE_RANGE = (1..3).freeze
+
+      # Schema types
+      JSON_SCHEMA_TYPE = "json_schema"
+
+      def extract_data(output_schema:, file_path: nil)
+        validate_schema_format!(output_schema)
+        generate_fake_data(output_schema)
+      end
+
+      def upload_file(file_path)
+        # No-op for faker provider
+        { id: "fake-file-#{SecureRandom.hex(8)}" }
+      end
+
+      private
+
+      def validate_schema_format!(output_schema)
+        return if valid_json_schema_format?(output_schema)
+
+        raise ArgumentError, "The Faker provider requires a JSON Schema format with 'type: \"json_schema\"'"
+      end
+
+      def valid_json_schema_format?(schema)
+        schema.is_a?(Hash) &&
+          ((schema.key?("type") && schema["type"] == JSON_SCHEMA_TYPE) ||
+           (schema.key?(:type) && schema[:type] == JSON_SCHEMA_TYPE))
+      end
+
+      def generate_fake_data(schema)
+        return generate_fake_data_from_json_schema(schema) if json_schema_format?(schema)
+        return generate_fake_data_from_hash(schema) if schema.is_a?(Hash)
+        return generate_fake_data_from_array(schema) if schema.is_a?(Array) && !schema.empty?
+
+        "fake-value"
+      end
+
+      def json_schema_format?(schema)
+        schema.is_a?(Hash) &&
+          ((schema.key?("type") && schema["type"] == JSON_SCHEMA_TYPE) ||
+           (schema.key?(:type) && schema[:type] == JSON_SCHEMA_TYPE))
+      end
+
+      def generate_fake_data_from_json_schema(schema)
+        properties = extract_properties_from_schema(schema)
+        generate_fake_data_from_properties(properties)
+      end
+
+      def extract_properties_from_schema(schema)
+        schema["properties"] || schema[:properties] || {}
+      end
+
+      def generate_fake_data_from_hash(schema)
+        result = {}
+        schema.each do |key, type|
+          result[key.to_s] = generate_value_for_type(type, key)
+        end
+        result
+      end
+
+      def generate_fake_data_from_array(schema)
+        count = rand(ARRAY_SIZE_RANGE)
+        Array.new(count) { generate_fake_data(schema.first) }
+      end
+
+      def generate_fake_data_from_properties(properties)
+        result = {}
+        properties.each do |key, type|
+          result[key.to_s] = generate_value_for_type(type, key)
+        end
+        result
+      end
+
+      def generate_value_for_type(type, key)
+        return generate_value_from_typed_hash(type, key) if typed_hash?(type)
+        return generate_fake_data(type) if type.is_a?(Hash)
+        return generate_fake_data_from_array(type) if type.is_a?(Array)
+
+        generate_string_value(key, nil)
+      end
+
+      def typed_hash?(type)
+        type.is_a?(Hash) && (type.key?("type") || type.key?(:type))
+      end
+
+      def generate_value_from_typed_hash(type, key)
+        schema_type = type["type"] || type[:type]
+        description = type["description"] || type[:description]
+
+        case schema_type
+        when "object"
+          properties = type["properties"] || type[:properties] || {}
+          generate_fake_data(properties)
+        when "array"
+          items = type["items"] || type[:items]
+          count = rand(ARRAY_SIZE_RANGE)
+          Array.new(count) { generate_value_for_type(items, key) }
+        when "string"
+          generate_string_value(key, description)
+        when "number", "integer"
+          rand(1..100)
+        when "boolean"
+          [true, false].sample
+        else
+          "fake-value"
+        end
+      end
+
+      def generate_string_value(key, description = nil)
+        key_string = key.to_s.downcase
+
+        case key_string
+        when /name/
+          generate_name_value
+        when /email/
+          generate_email_value
+        when /phone/
+          generate_phone_value
+        when /address/
+          generate_address_value
+        when /summary/, /objective/, /description/
+          generate_description_value
+        when /skill/
+          SKILLS.sample
+        when /title/, /position/, /role/
+          JOB_TITLES.sample
+        when /company/, /employer/, /organization/
+          COMPANIES.sample
+        when /university/, /school/, /college/
+          UNIVERSITIES.sample
+        when /degree/
+          DEGREES.sample
+        when /major/, /field/
+          MAJORS.sample
+        when /year/, /years/
+          generate_years_value
+        when /start_date/, /start/
+          generate_date_value(START_YEAR_RANGE)
+        when /end_date/, /end/
+          generate_date_value(END_YEAR_RANGE)
+        when /date/
+          generate_date_value(GENERAL_YEAR_RANGE)
+        when /url/, /website/, /link/
+          generate_url_value
+        when /language/
+          LANGUAGES.sample
+        else
+          generate_default_value(key)
+        end
+      end
+
+      def generate_name_value
+        "John Doe"
+      end
+
+      def generate_email_value
+        "john.doe@example.com"
+      end
+
+      def generate_phone_value
+        "+1 (555) 123-4567"
+      end
+
+      def generate_address_value
+        "123 Main St, Anytown, CA 94088"
+      end
+
+      def generate_description_value
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+      end
+
+      def generate_years_value
+        rand(1..10).to_s
+      end
+
+      def generate_date_value(year_range)
+        year = rand(year_range)
+        month = format("%02d", rand(1..12))
+        day = format("%02d", rand(1..28))
+        "#{year}-#{month}-#{day}"
+      end
+
+      def generate_url_value
+        "https://www.example.com"
+      end
+
+      def generate_default_value(key)
+        "fake-#{key}-#{SecureRandom.hex(4)}"
+      end
+    end
+  end
+end
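
For orientation only, below is a minimal sketch of how the Anthropic provider added in this release might be driven, based solely on the interfaces visible in the diff above. The require path, the Struct used as a stand-in for the gem's real configuration object, and the sample schema fields are assumptions for illustration, not part of the released package.

    require "cv_parser" # assumed require path for the cv-parser gem

    # Hypothetical stand-in for the gem's configuration object; the providers above
    # only read these attributes (see Base#setup_http_client and the Anthropic class).
    FakeConfig = Struct.new(:api_key, :model, :max_tokens, :temperature, :timeout,
                            :provider_options, :prompt, :system_prompt, keyword_init: true)

    config = FakeConfig.new(
      api_key: ENV["ANTHROPIC_API_KEY"],
      max_tokens: 4096,      # passed straight through to the Messages API request body
      temperature: 0,
      provider_options: {}   # Base#setup_http_client reads :headers from this hash
    )

    # Both the Anthropic and Faker providers require a hash with type: "json_schema".
    schema = {
      type: "json_schema",
      properties: {
        name: { type: "string" },
        email: { type: "string" },
        skills: { type: "array", items: { type: "string" } }
      },
      required: ["name"]
    }

    provider = CvParser::Providers::Anthropic.new(config)
    data = provider.extract_data(output_schema: schema, file_path: "resume.pdf")
    puts data["skills"]

Swapping in CvParser::Providers::Faker.new(config) returns schema-shaped sample data without any network call, which is how the extraction path can be exercised in tests.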