aven 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +35 -0
- data/Rakefile +19 -0
- data/app/assets/stylesheets/aven/application.css +14 -0
- data/app/assets/stylesheets/aven/application.tailwind.css +7 -0
- data/app/assets/stylesheets/aven/tailwind.css +224 -0
- data/app/channels/aven/chat/thread_channel.rb +39 -0
- data/app/components/aven/application_view_component.rb +15 -0
- data/app/components/aven/views/admin/dashboard/index/component.html.erb +1 -0
- data/app/components/aven/views/admin/dashboard/index/component.rb +5 -0
- data/app/components/aven/views/articles/edit/component.html.erb +14 -0
- data/app/components/aven/views/articles/edit/component.rb +14 -0
- data/app/components/aven/views/articles/form/component.html.erb +45 -0
- data/app/components/aven/views/articles/form/component.rb +27 -0
- data/app/components/aven/views/articles/index/component.html.erb +93 -0
- data/app/components/aven/views/articles/index/component.rb +29 -0
- data/app/components/aven/views/articles/new/component.html.erb +13 -0
- data/app/components/aven/views/articles/new/component.rb +14 -0
- data/app/components/aven/views/articles/show/component.html.erb +110 -0
- data/app/components/aven/views/articles/show/component.rb +34 -0
- data/app/components/aven/views/oauth/error/component.html.erb +44 -0
- data/app/components/aven/views/oauth/error/component.rb +30 -0
- data/app/components/aven/views/static/index/component.html.erb +17 -0
- data/app/components/aven/views/static/index/component.rb +16 -0
- data/app/components/aven/views/static/index/controller.js +7 -0
- data/app/controllers/aven/admin/base.rb +16 -0
- data/app/controllers/aven/admin/dashboard_controller.rb +9 -0
- data/app/controllers/aven/agentic/agents_controller.rb +56 -0
- data/app/controllers/aven/agentic/documents_controller.rb +51 -0
- data/app/controllers/aven/agentic/mcp_controller.rb +124 -0
- data/app/controllers/aven/agentic/tools_controller.rb +37 -0
- data/app/controllers/aven/ai/text_controller.rb +41 -0
- data/app/controllers/aven/application_controller.rb +27 -0
- data/app/controllers/aven/articles_controller.rb +114 -0
- data/app/controllers/aven/auth_controller.rb +12 -0
- data/app/controllers/aven/chat/threads_controller.rb +67 -0
- data/app/controllers/aven/oauth/auth0_controller.rb +84 -0
- data/app/controllers/aven/oauth/base_controller.rb +183 -0
- data/app/controllers/aven/oauth/documentation/auth0.md +387 -0
- data/app/controllers/aven/oauth/documentation/entra_id.md +608 -0
- data/app/controllers/aven/oauth/documentation/github.md +329 -0
- data/app/controllers/aven/oauth/documentation/google.md +253 -0
- data/app/controllers/aven/oauth/entra_id_controller.rb +92 -0
- data/app/controllers/aven/oauth/github_controller.rb +91 -0
- data/app/controllers/aven/oauth/google_controller.rb +64 -0
- data/app/controllers/aven/static_controller.rb +7 -0
- data/app/controllers/aven/tags_controller.rb +44 -0
- data/app/controllers/aven/workspaces_controller.rb +20 -0
- data/app/controllers/concerns/aven/authentication.rb +49 -0
- data/app/controllers/concerns/aven/controller_helpers.rb +38 -0
- data/app/helpers/aven/application_helper.rb +16 -0
- data/app/javascript/aven/application.js +3 -0
- data/app/javascript/aven/controllers/application.js +5 -0
- data/app/javascript/aven/controllers/index.js +11 -0
- data/app/jobs/aven/agentic/document_embedding_job.rb +28 -0
- data/app/jobs/aven/agentic/document_ocr_job.rb +28 -0
- data/app/jobs/aven/application_job.rb +4 -0
- data/app/jobs/aven/chat/calculate_cost_job.rb +26 -0
- data/app/jobs/aven/chat/run_job.rb +27 -0
- data/app/mailers/aven/application_mailer.rb +6 -0
- data/app/models/aven/agentic/agent.rb +76 -0
- data/app/models/aven/agentic/agent_document.rb +37 -0
- data/app/models/aven/agentic/agent_tool.rb +37 -0
- data/app/models/aven/agentic/document.rb +162 -0
- data/app/models/aven/agentic/document_embedding.rb +39 -0
- data/app/models/aven/agentic/tool.rb +106 -0
- data/app/models/aven/agentic/tool_parameter.rb +56 -0
- data/app/models/aven/application_record.rb +5 -0
- data/app/models/aven/article.rb +86 -0
- data/app/models/aven/article_attachment.rb +18 -0
- data/app/models/aven/article_relationship.rb +26 -0
- data/app/models/aven/chat/message.rb +135 -0
- data/app/models/aven/chat/thread.rb +159 -0
- data/app/models/aven/import/entry.rb +45 -0
- data/app/models/aven/import/item_link.rb +36 -0
- data/app/models/aven/import/processor.rb +123 -0
- data/app/models/aven/import.rb +102 -0
- data/app/models/aven/item/embed.rb +54 -0
- data/app/models/aven/item/embeddable.rb +141 -0
- data/app/models/aven/item/linkable.rb +212 -0
- data/app/models/aven/item/schema/builder.rb +139 -0
- data/app/models/aven/item/schemaed.rb +252 -0
- data/app/models/aven/item/schemas/base.rb +108 -0
- data/app/models/aven/item.rb +128 -0
- data/app/models/aven/item_link.rb +43 -0
- data/app/models/aven/item_schema.rb +87 -0
- data/app/models/aven/log.rb +66 -0
- data/app/models/aven/loggable.rb +20 -0
- data/app/models/aven/user.rb +40 -0
- data/app/models/aven/workspace.rb +93 -0
- data/app/models/aven/workspace_role.rb +46 -0
- data/app/models/aven/workspace_user.rb +54 -0
- data/app/models/aven/workspace_user_role.rb +38 -0
- data/app/models/concerns/aven/agentic/document_embeddable.rb +58 -0
- data/app/models/concerns/aven/searchable.rb +61 -0
- data/app/services/aven/agentic/dynamic_tool_builder.rb +81 -0
- data/app/services/aven/agentic/mcp/adapter.rb +77 -0
- data/app/services/aven/agentic/mcp/result_formatter.rb +57 -0
- data/app/services/aven/agentic/mcp/server_factory.rb +43 -0
- data/app/services/aven/agentic/ocr/base_extractor.rb +39 -0
- data/app/services/aven/agentic/ocr/excel_extractor.rb +43 -0
- data/app/services/aven/agentic/ocr/image_extractor.rb +22 -0
- data/app/services/aven/agentic/ocr/pdf_extractor.rb +48 -0
- data/app/services/aven/agentic/ocr/processor.rb +36 -0
- data/app/services/aven/agentic/ocr/textract_client.rb +131 -0
- data/app/services/aven/agentic/ocr/word_extractor.rb +34 -0
- data/app/services/aven/agentic/tool_result_formatter.rb +76 -0
- data/app/services/aven/agentic/tools/base.rb +55 -0
- data/app/services/aven/agentic/tools/concerns/boolean_filtering.rb +40 -0
- data/app/services/aven/agentic/tools/concerns/enum_filtering.rb +47 -0
- data/app/services/aven/agentic/tools/concerns/geo_filtering.rb +56 -0
- data/app/services/aven/agentic/tools/concerns/range_filtering.rb +51 -0
- data/app/services/aven/chat/broadcaster.rb +59 -0
- data/app/services/aven/chat/config.rb +93 -0
- data/app/services/aven/chat/message_builder.rb +42 -0
- data/app/services/aven/chat/orchestrator.rb +69 -0
- data/app/services/aven/chat/runner.rb +105 -0
- data/app/services/aven/chat/title_generator.rb +61 -0
- data/app/services/aven/external/gmail_client.rb +173 -0
- data/app/services/aven/external/google_contacts_client.rb +95 -0
- data/app/views/layouts/aven/admin.html.erb +16 -0
- data/app/views/layouts/aven/application.html.erb +18 -0
- data/config/importmap.rb +16 -0
- data/config/routes.rb +63 -0
- data/db/migrate/20200101000001_create_aven_users.rb +19 -0
- data/db/migrate/20200101000002_create_aven_workspaces.rb +14 -0
- data/db/migrate/20200101000003_create_aven_workspace_users.rb +12 -0
- data/db/migrate/20200101000004_create_aven_workspace_roles.rb +13 -0
- data/db/migrate/20200101000005_create_aven_workspace_user_roles.rb +12 -0
- data/db/migrate/20200101000006_create_aven_logs.rb +21 -0
- data/db/migrate/20200101000009_create_aven_items.rb +17 -0
- data/db/migrate/20200101000010_create_aven_item_links.rb +17 -0
- data/db/migrate/20200101000011_create_aven_agentic_tools.rb +19 -0
- data/db/migrate/20200101000012_create_aven_agentic_tool_parameters.rb +20 -0
- data/db/migrate/20200101000013_create_aven_agentic_documents.rb +22 -0
- data/db/migrate/20200101000014_create_aven_agentic_document_embeddings.rb +18 -0
- data/db/migrate/20200101000015_create_aven_agentic_agents.rb +18 -0
- data/db/migrate/20200101000016_create_aven_agentic_agent_tools.rb +13 -0
- data/db/migrate/20200101000017_create_aven_agentic_agent_documents.rb +13 -0
- data/db/migrate/20200101000018_create_aven_chat_threads.rb +19 -0
- data/db/migrate/20200101000019_create_aven_chat_messages.rb +26 -0
- data/db/migrate/20200101000020_add_pg_search_support.rb +21 -0
- data/db/migrate/20200101000021_create_aven_item_schemas.rb +18 -0
- data/db/migrate/20200101000022_create_aven_imports.rb +23 -0
- data/db/migrate/20200101000023_create_aven_import_entries.rb +13 -0
- data/db/migrate/20200101000024_create_aven_import_item_links.rb +13 -0
- data/db/migrate/20200101000025_create_aven_articles.rb +19 -0
- data/db/migrate/20200101000026_create_aven_article_attachments.rb +13 -0
- data/db/migrate/20200101000027_create_aven_article_relationships.rb +15 -0
- data/lib/aven/configuration.rb +87 -0
- data/lib/aven/engine.rb +43 -0
- data/lib/aven/model/tenant_model.rb +91 -0
- data/lib/aven/model.rb +6 -0
- data/lib/aven/version.rb +3 -0
- data/lib/aven.rb +8 -0
- data/lib/tasks/annotate_rb.rake +10 -0
- data/lib/tasks/aven_tasks.rake +21 -0
- metadata +426 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  # Model concern that wires PostgreSQL full-text search (pg_search) into the
  # including class and exposes two small class-level macros on top of it.
  module Searchable
    extend ActiveSupport::Concern

    included do
      include PgSearch::Model
    end

    class_methods do
      # Declares a +search+ pg_search scope on the model.
      #
      # Any option accepted by +pg_search_scope+ may be passed through; when
      # the caller does not supply +using:+, tsearch with prefix matching and
      # negation is used.
      #
      # When the model's table has a +workspace_id+ column, a
      # +search_in_workspace(workspace, query)+ scope is defined as well.
      #
      # NOTE(review): +column_names+ is read when this macro runs, i.e. while
      # the model class body is being evaluated — presumably the table must
      # already exist at that point; confirm for fresh databases.
      #
      # @example Minimal
      #   class Product < ApplicationRecord
      #     include Aven::Searchable
      #
      #     searchable against: [:name, :description]
      #   end
      #
      # @example Custom search features and ranking
      #   searchable against: [:name, :description],
      #              using: { tsearch: { prefix: true, dictionary: "english" } },
      #              ranked_by: ":tsearch"
      #
      # @example Searching through associations
      #   searchable against: [:title],
      #              associated_against: { author: :name, tags: :label }
      #
      # @example Backed by a precomputed tsvector column
      #   searchable against: [:name, :description],
      #              using: { tsearch: { tsvector_column: "searchable" } }
      #
      # @param against [Array<Symbol>, Symbol, Hash] columns to search
      # @param options [Hash] extra options forwarded to +pg_search_scope+
      def searchable(against:, **options)
        default_options = { using: { tsearch: { prefix: true, negation: true } } }
        scope_options = options.reverse_merge(default_options)

        pg_search_scope(:search, against:, **scope_options)

        # The workspace-scoped helper only makes sense for tenant tables.
        return unless column_names.include?("workspace_id")

        scope :search_in_workspace, ->(workspace, query) {
          where(workspace_id: workspace.id).search(query)
        }
      end

      # Registers the model with pg_search's multisearch so it takes part in
      # global, cross-model search.
      #
      # @example
      #   searchable_globally against: [:name, :description]
      #
      # @param against [Array<Symbol>, Symbol] columns to index
      # @param options [Hash] extra options forwarded to +multisearchable+
      def searchable_globally(against:, **options)
        multisearchable(against:, **options)
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    # Builds RubyLLM::Tool subclasses at runtime from Aven::Agentic::Tool
    # database records, with an optional per-process cache.
    class DynamicToolBuilder
      class << self
        # Build a RubyLLM tool from a database record.
        #
        # The generated class exposes +search_tool_class+, +tool_record+ and
        # +tool_name+ as class-level readers; its instances report the record's
        # name and delegate +execute+ to the underlying tool class.
        #
        # @param tool_record [Aven::Agentic::Tool] Database record
        # @return [Class, nil] RubyLLM::Tool subclass, or nil when the record's
        #   +valid_class?+ check fails
        def build(tool_record)
          return nil unless tool_record.valid_class?

          # Copy everything into locals: the Class.new block below is a closure
          # and cannot see this method's receiver.
          tool_class = tool_record.tool_class
          tool_description = tool_record.effective_description
          tool_parameters = tool_record.parameters.to_a
          tool_name = tool_record.name

          # Create dynamic RubyLLM tool class
          Class.new(RubyLLM::Tool) do
            @search_tool_class = tool_class
            @tool_record = tool_record
            @tool_name = tool_name

            class << self
              attr_reader :search_tool_class, :tool_record, :tool_name
            end

            description tool_description

            # Declare each stored parameter through RubyLLM's param DSL,
            # collapsing the stored type into the types used here.
            tool_parameters.each do |p|
              param_name = p.name.to_sym
              param_desc = p.effective_description
              param_type = case p.param_type.to_sym
                           when :integer, :float then :number
                           when :array then :array
                           when :boolean then :boolean
                           else :string
                           end

              param param_name, type: param_type, desc: param_desc, required: p.required?
            end

            # Report the configured tool name instead of a class-derived one
            # (the class is anonymous).
            def name
              self.class.tool_name
            end

            # Delegate to the real tool class and format its result.
            def execute(**params)
              result = self.class.search_tool_class.call(**params)
              Aven::Agentic::ToolResultFormatter.format(self.class.tool_name, result)
            end
          end
        end

        # Build all enabled tools, optionally limited to a workspace.
        #
        # @param workspace [Aven::Workspace, nil] Workspace to scope tools
        # @return [Array<Class>] RubyLLM::Tool subclasses (records that fail
        #   to build are skipped)
        def build_all(workspace: nil)
          scope = Aven::Agentic::Tool.enabled.includes(:parameters)
          scope = scope.for_workspace(workspace) if workspace

          scope.map { |tool_record| build(tool_record) }.compact
        end

        # Return the cached tool class for a record, building it when missing
        # or when the record has changed since it was cached.
        #
        # The cache holds at most one entry per record id: when +updated_at+
        # changes, the previous class is replaced rather than left behind, so
        # the cache cannot grow with every edit of a tool. Failed builds (nil)
        # are never cached and are retried on the next call.
        #
        # @param tool_record [Aven::Agentic::Tool] Database record
        # @return [Class, nil] RubyLLM::Tool subclass or nil
        def cached_build(tool_record)
          @tool_cache ||= {}
          version = tool_record.updated_at.to_f
          entry = @tool_cache[tool_record.id]
          return entry[:tool] if entry && entry[:version] == version

          built = build(tool_record)
          if built
            @tool_cache[tool_record.id] = { version:, tool: built }
          else
            @tool_cache.delete(tool_record.id)
          end
          built
        end

        # Drop every cached tool class.
        #
        # @return [Hash] the new, empty cache
        def clear_cache!
          @tool_cache = {}
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Mcp
      # Wraps a dynamically built tool class so it can be registered with an
      # MCP server: exposes its name, description, JSON-schema input
      # definition and a callable handler.
      class Adapter
        # @param tool_class [Class] tool class exposing +tool_name+ and
        #   (optionally) +tool_record+ as class-level readers
        # @param context [Hash] server context; stored but not read here
        def initialize(tool_class, context = {})
          @tool_class = tool_class
          @context = context
        end

        # Convert tool to MCP tool format.
        #
        # @return [Hash] with +:name+, +:description+, +:input_schema+ and
        #   +:handler+ (a Method object bound to #handle)
        def to_mcp_tool
          {
            name: tool_name,
            description: tool_description,
            input_schema: build_input_schema,
            handler: method(:handle)
          }
        end

        # Handle MCP tool call: instantiate the tool, run it with the given
        # arguments as keywords, and format the outcome.
        #
        # Any StandardError raised by the tool is logged and converted into an
        # MCP error payload instead of propagating.
        #
        # @param params [Hash] tool arguments (string or symbol keys)
        # @return [Hash] MCP-formatted result or error
        def handle(params)
          result = @tool_class.new.execute(**params.symbolize_keys)
          ResultFormatter.format(result)
        rescue => e
          Rails.logger.error("[Aven::MCP] Tool execution error: #{e.message}")
          ResultFormatter.format_error(e)
        end

        private

        def tool_name
          @tool_class.tool_name
        end

        # The database record behind the tool class, if it exposes one.
        #
        # +@tool_class+ is already a class, so the reader lives on it directly;
        # going through +@tool_class.class+ would ask +Class+ itself and raise
        # NoMethodError.
        def tool_record
          @tool_class.tool_record if @tool_class.respond_to?(:tool_record)
        end

        def tool_description
          tool_record&.effective_description || "No description"
        end

        # Build the JSON-schema object describing the tool's parameters.
        #
        # @return [Hash] schema with +:type+, +:properties+ and, when a record
        #   is available, +:required+
        def build_input_schema
          record = tool_record
          return { type: "object", properties: {} } unless record

          properties = {}
          required = []

          record.parameters.each do |param|
            properties[param.name] = {
              type: json_schema_type(param.param_type),
              description: param.effective_description
            }

            required << param.name if param.required?
          end

          {
            type: "object",
            properties:,
            required:
          }
        end

        # Map a stored parameter type to its JSON-schema type name; unknown
        # types fall back to "string".
        def json_schema_type(param_type)
          case param_type.to_sym
          when :integer then "integer"
          when :float then "number"
          when :boolean then "boolean"
          when :array then "array"
          when :object then "object"
          else "string"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Mcp
      # Shapes tool results and errors into MCP response payloads consisting
      # of a single text content item.
      class ResultFormatter
        class << self
          # Wrap a successful tool result.
          #
          # @param result [Object] Tool execution result
          # @return [Hash] +{ content: [{ type: "text", text: ... }] }+
          def format(result)
            { content: [text_item(format_content(result))] }
          end

          # Wrap an error; the payload carries +isError: true+ and the
          # exception message prefixed with "Error: ".
          #
          # @param error [Exception] Error that occurred
          # @return [Hash] MCP-formatted error
          def format_error(error)
            { content: [text_item("Error: #{error.message}")], isError: true }
          end

          private

          # A single MCP text content item.
          def text_item(text)
            { type: "text", text: }
          end

          # Render a result as text: strings pass through, hashes become
          # pretty-printed JSON, arrays are rendered element by element and
          # separated by blank lines, nil becomes "No result", and anything
          # else uses +to_s+.
          def format_content(result)
            case result
            when String then result
            when Hash   then JSON.pretty_generate(result)
            when Array
              rendered = result.map { |item| format_content(item) }
              rendered.join("\n\n")
            when nil    then "No result"
            else result.to_s
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Mcp
      # Assembles an MCP server pre-loaded with every enabled Aven tool.
      class ServerFactory
        SERVER_NAME = "aven-mcp-server"
        SERVER_VERSION = Aven::VERSION

        class << self
          # Create an MCP server and register the available tools on it.
          #
          # @param server_context [Hash] Context data for the server; the
          #   +:workspace+ entry, when present, limits which tools are built
          # @return [MCP::Server, nil] Configured server, or nil when the
          #   +MCP::Server+ constant is not defined
          def build(server_context: {})
            return nil unless defined?(::MCP::Server)

            ::MCP::Server.new(name: SERVER_NAME, version: SERVER_VERSION).tap do |server|
              register_tools(server, server_context)
            end
          end

          private

          # Build the tool classes for the context's workspace and register
          # each one on the server through an Adapter.
          def register_tools(server, context)
            built_tools = Aven::Agentic::DynamicToolBuilder.build_all(workspace: context[:workspace])

            built_tools.each do |tool_class|
              mcp_tool = Adapter.new(tool_class, context).to_mcp_tool
              server.register_tool(mcp_tool)
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Abstract base for document text extractors. Subclasses implement
      # +extract+ and may use +with_tempfile+ to work on a local copy of the
      # document's attached file.
      class BaseExtractor
        class << self
          # Extract text from document.
          #
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Extracted text
          # @raise [NotImplementedError] always, in this base class
          def extract(document)
            raise NotImplementedError, "#{name} must implement extract"
          end

          protected

          # Download the document's file to a temporary location, yield its
          # path, and remove the file afterwards (even if the block raises).
          #
          # The temp file keeps the original extension so format-sniffing
          # libraries can recognise it. The filename is coerced with +to_s+
          # because File.extname raises TypeError for nil and for objects that
          # are not implicitly convertible to String.
          #
          # @param document [#file, #filename] document with an attachment
          # @yieldparam path [String] path of the temporary copy
          # @return [Object, nil] the block's value, or nil when no file is
          #   attached
          def with_tempfile(document)
            return nil unless document.file.attached?

            extension = File.extname(document.filename.to_s)
            tempfile = Tempfile.new(["aven_ocr", extension])

            begin
              tempfile.binmode
              tempfile.write(document.file.download)
              # The block reopens the file by path, so make sure buffered
              # bytes are on disk before handing the path over.
              tempfile.flush
              tempfile.rewind

              yield tempfile.path
            ensure
              tempfile.close
              tempfile.unlink
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Extracts spreadsheet contents as tab-separated text, one section per
      # sheet.
      class ExcelExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Rendered sheets, or nil when nothing could
          #   be extracted
          def extract(document)
            with_tempfile(document) { |path| extract_xlsx(path) }
          end

          private

          # Read the workbook at +path+ with the roo gem. Each sheet becomes a
          # "## <sheet name>" heading followed by its rows (cells stripped and
          # joined with tabs); sheets are separated by a "---" rule.
          # Returns nil, after logging, when roo is not loaded or reading fails.
          def extract_xlsx(path)
            unless defined?(Roo::Spreadsheet)
              Rails.logger.warn("[Aven::OCR] roo gem not available")
              return nil
            end

            workbook = Roo::Spreadsheet.open(path)

            sections = workbook.sheets.map do |sheet_name|
              lines = workbook.sheet(sheet_name).each.map do |row|
                row.map { |cell| cell.to_s.strip }.join("\t")
              end

              "## #{sheet_name}\n\n#{lines.join("\n")}"
            end

            sections.join("\n\n---\n\n")
          rescue => e
            Rails.logger.error("[Aven::OCR] Excel extraction failed: #{e.message}")
            nil
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Extracts text from image documents through the configured OCR
      # provider.
      class ImageExtractor < BaseExtractor
        class << self
          # Run OCR on the document's image. Only the +:textract+ provider is
          # handled; with any other (or no) provider a warning is logged and
          # nil is returned.
          #
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Extracted text
          def extract(document)
            with_tempfile(document) do |path|
              next TextractClient.extract_document(path) if Aven.configuration.ocr&.provider == :textract

              Rails.logger.warn("[Aven::OCR] No OCR provider configured for images")
              nil
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Extracts text from PDF documents, preferring the embedded text layer
      # and falling back to OCR.
      class PdfExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Extracted text
          def extract(document)
            with_tempfile(document) do |path|
              # Text-based PDFs are answered from their text layer; scanned
              # PDFs (no usable text) go through OCR instead.
              extract_text_layer(path).presence || extract_with_ocr(path)
            end
          end

          private

          # Read the text of every page with the pdf-reader gem.
          # Returns nil when the gem is not loaded, when the document has no
          # non-blank text, or (after logging a warning) when reading fails.
          def extract_text_layer(path)
            return nil unless defined?(PDF::Reader)

            page_texts = PDF::Reader.new(path).pages.map(&:text)
            combined = page_texts.join("\n\n")

            combined.strip.present? ? combined : nil
          rescue => e
            Rails.logger.warn("[Aven::OCR] PDF text extraction failed: #{e.message}")
            nil
          end

          # Send the file to AWS Textract when that provider is configured;
          # otherwise log a warning and return nil.
          def extract_with_ocr(path)
            return TextractClient.extract_document(path) if Aven.configuration.ocr&.provider == :textract

            Rails.logger.warn("[Aven::OCR] No OCR provider configured")
            nil
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Entry point for text extraction: picks the extractor matching the
      # document's type and runs it.
      class Processor
        class << self
          # Extract the text content of a document.
          #
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Extracted text, or nil when no extractor
          #   handles the document's type
          def process(document)
            extractor_for(document)&.extract(document)
          end

          private

          # Choose the extractor class from the document's type predicates,
          # checked in order: PDF, image, Word, Excel. Returns nil when none
          # of them applies.
          def extractor_for(document)
            if document.pdf?
              PdfExtractor
            elsif document.image?
              ImageExtractor
            elsif document.word_doc?
              WordExtractor
            elsif document.excel?
              ExcelExtractor
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Thin wrapper around AWS Textract text detection, configured from
      # +Aven.configuration.ocr+.
      class TextractClient
        class << self
          # Extract text from a local file with the synchronous
          # DetectDocumentText operation.
          #
          # @param file_path [String] Path to the file
          # @return [String, nil] Extracted text; nil when no client can be
          #   built or the Textract API reports a service error
          def extract_document(file_path)
            client = build_client
            return nil unless client

            bytes = File.binread(file_path)

            response = client.detect_document_text(
              document: { bytes: }
            )

            extract_text_from_response(response)
          rescue Aws::Textract::Errors::ServiceError => e
            Rails.logger.error("[Aven::Textract] API error: #{e.message}")
            nil
          end

          # Extract text from a document stored in S3 with the asynchronous
          # StartDocumentTextDetection operation, polling until the job ends.
          #
          # @param s3_bucket [String] S3 bucket name
          # @param s3_key [String] S3 object key
          # @return [String, nil] Extracted text; nil when no client can be
          #   built, the job fails or times out, or the API reports a service
          #   error
          def extract_document_async(s3_bucket:, s3_key:)
            client = build_client
            return nil unless client

            # Start async job
            start_response = client.start_document_text_detection(
              document_location: {
                s3_object: {
                  bucket: s3_bucket,
                  name: s3_key
                }
              }
            )

            job_id = start_response.job_id
            wait_for_job(client, job_id)
          rescue Aws::Textract::Errors::ServiceError => e
            Rails.logger.error("[Aven::Textract] Async API error: #{e.message}")
            nil
          end

          private

          # Build a Textract client, or return nil when the SDK is not loaded
          # or no AWS region is configured.
          def build_client
            return nil unless defined?(Aws::Textract::Client)

            config = Aven.configuration.ocr
            return nil unless config&.aws_region

            # Only pass :credentials when explicit keys are configured. An
            # explicit `credentials: nil` counts as a supplied option and
            # would prevent the SDK from falling back to its default
            # credential provider chain.
            client_options = {
              region: config.aws_region,
              credentials: aws_credentials(config)
            }.compact

            Aws::Textract::Client.new(**client_options)
          end

          # Explicit credentials from configuration, or nil to signal that the
          # SDK's default credential chain should be used.
          def aws_credentials(config)
            if config.aws_access_key_id && config.aws_secret_access_key
              Aws::Credentials.new(
                config.aws_access_key_id,
                config.aws_secret_access_key
              )
            else
              # Use default credential chain
              nil
            end
          end

          # Join the text of all LINE blocks in reading order.
          #
          # Bounding-box coordinates are relative to a single page, so blocks
          # are ordered by page first, then top-to-bottom, then left-to-right;
          # otherwise lines from different pages of one response would be
          # interleaved. A missing page number sorts as page 0.
          def extract_text_from_response(response)
            lines = Array(response.blocks)
              .select { |b| b.block_type == "LINE" }
              .sort_by { |b| [b.page.to_i, b.geometry.bounding_box.top, b.geometry.bounding_box.left] }
              .map(&:text)

            lines.join("\n")
          end

          # Poll an asynchronous job until it reaches a terminal state.
          #
          # @param client [Aws::Textract::Client]
          # @param job_id [String]
          # @param max_attempts [Integer] polls allowed while IN_PROGRESS
          # @param delay [Numeric] seconds to sleep between polls
          # @return [String, nil] collected text on SUCCEEDED/PARTIAL_SUCCESS;
          #   nil on FAILED, timeout, or an unrecognised status
          def wait_for_job(client, job_id, max_attempts: 30, delay: 5)
            attempts = 0

            loop do
              response = client.get_document_text_detection(job_id:)

              case response.job_status
              when "SUCCEEDED"
                return collect_all_pages(client, job_id)
              when "PARTIAL_SUCCESS"
                # Terminal state: some pages could not be processed, but the
                # rest are available. Return what there is.
                Rails.logger.warn("[Aven::Textract] Job partially succeeded: #{response.status_message}")
                return collect_all_pages(client, job_id)
              when "FAILED"
                Rails.logger.error("[Aven::Textract] Job failed: #{response.status_message}")
                return nil
              when "IN_PROGRESS"
                attempts += 1
                if attempts >= max_attempts
                  Rails.logger.error("[Aven::Textract] Job timed out")
                  return nil
                end
                sleep(delay)
              else
                # Never spin on a status we do not understand: without this
                # branch the loop would re-poll immediately, forever.
                Rails.logger.error("[Aven::Textract] Unexpected job status: #{response.job_status.inspect}")
                return nil
              end
            end
          end

          # Fetch every result page of a finished job (following next_token)
          # and join the per-response text with blank lines.
          def collect_all_pages(client, job_id)
            all_text = []
            next_token = nil

            loop do
              params = { job_id: }
              params[:next_token] = next_token if next_token

              response = client.get_document_text_detection(params)
              all_text << extract_text_from_response(response)

              next_token = response.next_token
              break unless next_token
            end

            all_text.join("\n\n")
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Aven
  module Agentic
    module Ocr
      # Extracts paragraph text from Word (.docx) documents.
      class WordExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Paragraph text, or nil when nothing could be
          #   extracted
          def extract(document)
            with_tempfile(document) { |path| extract_docx(path) }
          end

          private

          # Read the file at +path+ with the docx gem and join its paragraphs
          # with blank lines. Returns nil, after logging, when the gem is not
          # loaded or reading fails.
          def extract_docx(path)
            unless defined?(Docx::Document)
              Rails.logger.warn("[Aven::OCR] docx gem not available")
              return nil
            end

            Docx::Document.open(path).paragraphs.map(&:text).join("\n\n")
          rescue => e
            Rails.logger.error("[Aven::OCR] Word extraction failed: #{e.message}")
            nil
          end
        end
      end
    end
  end
end
|