aven 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +35 -0
  4. data/Rakefile +19 -0
  5. data/app/assets/stylesheets/aven/application.css +14 -0
  6. data/app/assets/stylesheets/aven/application.tailwind.css +7 -0
  7. data/app/assets/stylesheets/aven/tailwind.css +224 -0
  8. data/app/channels/aven/chat/thread_channel.rb +39 -0
  9. data/app/components/aven/application_view_component.rb +15 -0
  10. data/app/components/aven/views/admin/dashboard/index/component.html.erb +1 -0
  11. data/app/components/aven/views/admin/dashboard/index/component.rb +5 -0
  12. data/app/components/aven/views/articles/edit/component.html.erb +14 -0
  13. data/app/components/aven/views/articles/edit/component.rb +14 -0
  14. data/app/components/aven/views/articles/form/component.html.erb +45 -0
  15. data/app/components/aven/views/articles/form/component.rb +27 -0
  16. data/app/components/aven/views/articles/index/component.html.erb +93 -0
  17. data/app/components/aven/views/articles/index/component.rb +29 -0
  18. data/app/components/aven/views/articles/new/component.html.erb +13 -0
  19. data/app/components/aven/views/articles/new/component.rb +14 -0
  20. data/app/components/aven/views/articles/show/component.html.erb +110 -0
  21. data/app/components/aven/views/articles/show/component.rb +34 -0
  22. data/app/components/aven/views/oauth/error/component.html.erb +44 -0
  23. data/app/components/aven/views/oauth/error/component.rb +30 -0
  24. data/app/components/aven/views/static/index/component.html.erb +17 -0
  25. data/app/components/aven/views/static/index/component.rb +16 -0
  26. data/app/components/aven/views/static/index/controller.js +7 -0
  27. data/app/controllers/aven/admin/base.rb +16 -0
  28. data/app/controllers/aven/admin/dashboard_controller.rb +9 -0
  29. data/app/controllers/aven/agentic/agents_controller.rb +56 -0
  30. data/app/controllers/aven/agentic/documents_controller.rb +51 -0
  31. data/app/controllers/aven/agentic/mcp_controller.rb +124 -0
  32. data/app/controllers/aven/agentic/tools_controller.rb +37 -0
  33. data/app/controllers/aven/ai/text_controller.rb +41 -0
  34. data/app/controllers/aven/application_controller.rb +27 -0
  35. data/app/controllers/aven/articles_controller.rb +114 -0
  36. data/app/controllers/aven/auth_controller.rb +12 -0
  37. data/app/controllers/aven/chat/threads_controller.rb +67 -0
  38. data/app/controllers/aven/oauth/auth0_controller.rb +84 -0
  39. data/app/controllers/aven/oauth/base_controller.rb +183 -0
  40. data/app/controllers/aven/oauth/documentation/auth0.md +387 -0
  41. data/app/controllers/aven/oauth/documentation/entra_id.md +608 -0
  42. data/app/controllers/aven/oauth/documentation/github.md +329 -0
  43. data/app/controllers/aven/oauth/documentation/google.md +253 -0
  44. data/app/controllers/aven/oauth/entra_id_controller.rb +92 -0
  45. data/app/controllers/aven/oauth/github_controller.rb +91 -0
  46. data/app/controllers/aven/oauth/google_controller.rb +64 -0
  47. data/app/controllers/aven/static_controller.rb +7 -0
  48. data/app/controllers/aven/tags_controller.rb +44 -0
  49. data/app/controllers/aven/workspaces_controller.rb +20 -0
  50. data/app/controllers/concerns/aven/authentication.rb +49 -0
  51. data/app/controllers/concerns/aven/controller_helpers.rb +38 -0
  52. data/app/helpers/aven/application_helper.rb +16 -0
  53. data/app/javascript/aven/application.js +3 -0
  54. data/app/javascript/aven/controllers/application.js +5 -0
  55. data/app/javascript/aven/controllers/index.js +11 -0
  56. data/app/jobs/aven/agentic/document_embedding_job.rb +28 -0
  57. data/app/jobs/aven/agentic/document_ocr_job.rb +28 -0
  58. data/app/jobs/aven/application_job.rb +4 -0
  59. data/app/jobs/aven/chat/calculate_cost_job.rb +26 -0
  60. data/app/jobs/aven/chat/run_job.rb +27 -0
  61. data/app/mailers/aven/application_mailer.rb +6 -0
  62. data/app/models/aven/agentic/agent.rb +76 -0
  63. data/app/models/aven/agentic/agent_document.rb +37 -0
  64. data/app/models/aven/agentic/agent_tool.rb +37 -0
  65. data/app/models/aven/agentic/document.rb +162 -0
  66. data/app/models/aven/agentic/document_embedding.rb +39 -0
  67. data/app/models/aven/agentic/tool.rb +106 -0
  68. data/app/models/aven/agentic/tool_parameter.rb +56 -0
  69. data/app/models/aven/application_record.rb +5 -0
  70. data/app/models/aven/article.rb +86 -0
  71. data/app/models/aven/article_attachment.rb +18 -0
  72. data/app/models/aven/article_relationship.rb +26 -0
  73. data/app/models/aven/chat/message.rb +135 -0
  74. data/app/models/aven/chat/thread.rb +159 -0
  75. data/app/models/aven/import/entry.rb +45 -0
  76. data/app/models/aven/import/item_link.rb +36 -0
  77. data/app/models/aven/import/processor.rb +123 -0
  78. data/app/models/aven/import.rb +102 -0
  79. data/app/models/aven/item/embed.rb +54 -0
  80. data/app/models/aven/item/embeddable.rb +141 -0
  81. data/app/models/aven/item/linkable.rb +212 -0
  82. data/app/models/aven/item/schema/builder.rb +139 -0
  83. data/app/models/aven/item/schemaed.rb +252 -0
  84. data/app/models/aven/item/schemas/base.rb +108 -0
  85. data/app/models/aven/item.rb +128 -0
  86. data/app/models/aven/item_link.rb +43 -0
  87. data/app/models/aven/item_schema.rb +87 -0
  88. data/app/models/aven/log.rb +66 -0
  89. data/app/models/aven/loggable.rb +20 -0
  90. data/app/models/aven/user.rb +40 -0
  91. data/app/models/aven/workspace.rb +93 -0
  92. data/app/models/aven/workspace_role.rb +46 -0
  93. data/app/models/aven/workspace_user.rb +54 -0
  94. data/app/models/aven/workspace_user_role.rb +38 -0
  95. data/app/models/concerns/aven/agentic/document_embeddable.rb +58 -0
  96. data/app/models/concerns/aven/searchable.rb +61 -0
  97. data/app/services/aven/agentic/dynamic_tool_builder.rb +81 -0
  98. data/app/services/aven/agentic/mcp/adapter.rb +77 -0
  99. data/app/services/aven/agentic/mcp/result_formatter.rb +57 -0
  100. data/app/services/aven/agentic/mcp/server_factory.rb +43 -0
  101. data/app/services/aven/agentic/ocr/base_extractor.rb +39 -0
  102. data/app/services/aven/agentic/ocr/excel_extractor.rb +43 -0
  103. data/app/services/aven/agentic/ocr/image_extractor.rb +22 -0
  104. data/app/services/aven/agentic/ocr/pdf_extractor.rb +48 -0
  105. data/app/services/aven/agentic/ocr/processor.rb +36 -0
  106. data/app/services/aven/agentic/ocr/textract_client.rb +131 -0
  107. data/app/services/aven/agentic/ocr/word_extractor.rb +34 -0
  108. data/app/services/aven/agentic/tool_result_formatter.rb +76 -0
  109. data/app/services/aven/agentic/tools/base.rb +55 -0
  110. data/app/services/aven/agentic/tools/concerns/boolean_filtering.rb +40 -0
  111. data/app/services/aven/agentic/tools/concerns/enum_filtering.rb +47 -0
  112. data/app/services/aven/agentic/tools/concerns/geo_filtering.rb +56 -0
  113. data/app/services/aven/agentic/tools/concerns/range_filtering.rb +51 -0
  114. data/app/services/aven/chat/broadcaster.rb +59 -0
  115. data/app/services/aven/chat/config.rb +93 -0
  116. data/app/services/aven/chat/message_builder.rb +42 -0
  117. data/app/services/aven/chat/orchestrator.rb +69 -0
  118. data/app/services/aven/chat/runner.rb +105 -0
  119. data/app/services/aven/chat/title_generator.rb +61 -0
  120. data/app/services/aven/external/gmail_client.rb +173 -0
  121. data/app/services/aven/external/google_contacts_client.rb +95 -0
  122. data/app/views/layouts/aven/admin.html.erb +16 -0
  123. data/app/views/layouts/aven/application.html.erb +18 -0
  124. data/config/importmap.rb +16 -0
  125. data/config/routes.rb +63 -0
  126. data/db/migrate/20200101000001_create_aven_users.rb +19 -0
  127. data/db/migrate/20200101000002_create_aven_workspaces.rb +14 -0
  128. data/db/migrate/20200101000003_create_aven_workspace_users.rb +12 -0
  129. data/db/migrate/20200101000004_create_aven_workspace_roles.rb +13 -0
  130. data/db/migrate/20200101000005_create_aven_workspace_user_roles.rb +12 -0
  131. data/db/migrate/20200101000006_create_aven_logs.rb +21 -0
  132. data/db/migrate/20200101000009_create_aven_items.rb +17 -0
  133. data/db/migrate/20200101000010_create_aven_item_links.rb +17 -0
  134. data/db/migrate/20200101000011_create_aven_agentic_tools.rb +19 -0
  135. data/db/migrate/20200101000012_create_aven_agentic_tool_parameters.rb +20 -0
  136. data/db/migrate/20200101000013_create_aven_agentic_documents.rb +22 -0
  137. data/db/migrate/20200101000014_create_aven_agentic_document_embeddings.rb +18 -0
  138. data/db/migrate/20200101000015_create_aven_agentic_agents.rb +18 -0
  139. data/db/migrate/20200101000016_create_aven_agentic_agent_tools.rb +13 -0
  140. data/db/migrate/20200101000017_create_aven_agentic_agent_documents.rb +13 -0
  141. data/db/migrate/20200101000018_create_aven_chat_threads.rb +19 -0
  142. data/db/migrate/20200101000019_create_aven_chat_messages.rb +26 -0
  143. data/db/migrate/20200101000020_add_pg_search_support.rb +21 -0
  144. data/db/migrate/20200101000021_create_aven_item_schemas.rb +18 -0
  145. data/db/migrate/20200101000022_create_aven_imports.rb +23 -0
  146. data/db/migrate/20200101000023_create_aven_import_entries.rb +13 -0
  147. data/db/migrate/20200101000024_create_aven_import_item_links.rb +13 -0
  148. data/db/migrate/20200101000025_create_aven_articles.rb +19 -0
  149. data/db/migrate/20200101000026_create_aven_article_attachments.rb +13 -0
  150. data/db/migrate/20200101000027_create_aven_article_relationships.rb +15 -0
  151. data/lib/aven/configuration.rb +87 -0
  152. data/lib/aven/engine.rb +43 -0
  153. data/lib/aven/model/tenant_model.rb +91 -0
  154. data/lib/aven/model.rb +6 -0
  155. data/lib/aven/version.rb +3 -0
  156. data/lib/aven.rb +8 -0
  157. data/lib/tasks/annotate_rb.rake +10 -0
  158. data/lib/tasks/aven_tasks.rake +21 -0
  159. metadata +426 -0
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  # Mixin that gives a model pg_search-backed search macros.
  # Including it pulls PgSearch::Model into the host class.
  module Searchable
    extend ActiveSupport::Concern

    included do
      include PgSearch::Model
    end

    class_methods do
      # Declares a `search` scope on the model via pg_search_scope.
      #
      # When the caller does not pass `using:`, tsearch with prefix matching
      # and negation is used. The merge is shallow: a caller-supplied `using:`
      # replaces that default entirely.
      #
      # If the model's table has a `workspace_id` column, a
      # `search_in_workspace(workspace, query)` scope is defined as well.
      #
      # @example Basic usage
      #   class Product < ApplicationRecord
      #     include Aven::Searchable
      #
      #     searchable against: [:name, :description]
      #   end
      #
      # @example With options
      #   searchable against: [:name, :description],
      #              using: { tsearch: { prefix: true, dictionary: "english" } },
      #              ranked_by: ":tsearch"
      #
      # @example With associations
      #   searchable against: [:title],
      #              associated_against: { author: :name, tags: :label }
      #
      # @example With tsvector column (for performance)
      #   searchable against: [:name, :description],
      #              using: { tsearch: { tsvector_column: "searchable" } }
      #
      # @param against [Array<Symbol>, Symbol, Hash] columns handed to pg_search
      # @param options [Hash] any further pg_search_scope options
      def searchable(against:, **options)
        fallback = { using: { tsearch: { prefix: true, negation: true } } }
        pg_search_scope(:search, against: against, **fallback.merge(options))

        # Only models that carry a workspace_id column get the combined scope.
        return unless column_names.include?("workspace_id")

        scope :search_in_workspace, lambda { |workspace, query|
          where(workspace_id: workspace.id).search(query)
        }
      end

      # Registers the model with pg_search multisearch (search across models).
      #
      # @example
      #   searchable_globally against: [:name, :description]
      #
      # @param against [Array<Symbol>, Symbol] columns to index
      # @param options [Hash] extra options forwarded to `multisearchable`
      def searchable_globally(against:, **options)
        multisearchable(against: against, **options)
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    # Builds anonymous RubyLLM::Tool subclasses from Aven::Agentic::Tool
    # database records, with an optional per-process cache.
    class DynamicToolBuilder
      class << self
        # Build a RubyLLM tool from a database record.
        #
        # @param tool_record [Aven::Agentic::Tool] Database record
        # @return [Class, nil] RubyLLM::Tool subclass, or nil when the record
        #   reports `valid_class?` as false
        def build(tool_record)
          return nil unless tool_record.valid_class?

          # Capture everything in locals: the Class.new block below is
          # evaluated with the new class as `self`, so these values are only
          # reachable through the closure.
          tool_class = tool_record.tool_class
          tool_description = tool_record.effective_description
          tool_parameters = tool_record.parameters.to_a
          tool_name = tool_record.name

          Class.new(RubyLLM::Tool) do
            @search_tool_class = tool_class
            @tool_record = tool_record
            @tool_name = tool_name

            class << self
              attr_reader :search_tool_class, :tool_record, :tool_name
            end

            description tool_description

            # Declare each stored parameter through RubyLLM's `param` DSL.
            tool_parameters.each do |p|
              # Integer and float both map to :number; unknown types fall
              # back to :string.
              param_type = case p.param_type.to_sym
                           when :integer, :float then :number
                           when :array then :array
                           when :boolean then :boolean
                           else :string
                           end

              param p.name.to_sym, type: param_type, desc: p.effective_description, required: p.required?
            end

            # Report the name stored on the database record.
            def name
              self.class.tool_name
            end

            # Delegate to the underlying tool class and format its result.
            def execute(**params)
              result = self.class.search_tool_class.call(**params)
              Aven::Agentic::ToolResultFormatter.format(self.class.tool_name, result)
            end
          end
        end

        # Build all enabled tools, optionally scoped to a workspace.
        #
        # @param workspace [Aven::Workspace, nil] Workspace to scope tools
        # @return [Array<Class>] RubyLLM::Tool subclasses (unbuildable records are skipped)
        def build_all(workspace: nil)
          scope = Aven::Agentic::Tool.enabled.includes(:parameters)
          scope = scope.for_workspace(workspace) if workspace

          scope.filter_map { |tool_record| build(tool_record) }
        end

        # Return the cached tool class for a record, rebuilding it when the
        # record's `updated_at` has changed since it was cached.
        #
        # The cache holds at most one entry per record id; a rebuilt class
        # replaces the previous one instead of accumulating next to it.
        # Records that cannot be built are not cached.
        #
        # @param tool_record [Aven::Agentic::Tool] Database record
        # @return [Class, nil] RubyLLM::Tool subclass or nil
        def cached_build(tool_record)
          @tool_cache ||= {}
          record_id = tool_record.id
          stamp = tool_record.updated_at.to_f

          entry = @tool_cache[record_id]
          return entry[:tool] if entry && entry[:stamp] == stamp

          built = build(tool_record)
          if built
            @tool_cache[record_id] = { stamp:, tool: built }
          else
            # Drop any class cached for an earlier, buildable version.
            @tool_cache.delete(record_id)
          end
          built
        end

        # Forget every cached tool class.
        def clear_cache!
          @tool_cache = {}
        end
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Mcp
      # Presents a tool class as an MCP tool definition: name, description,
      # JSON-schema input description and a callable handler.
      #
      # The tool class is expected to expose class-level `tool_name` and
      # `tool_record` readers and to be instantiable with `new`.
      class Adapter
        # @param tool_class [Class] tool class to adapt
        # @param context [Hash] server context (stored; not read by this class)
        def initialize(tool_class, context = {})
          @tool_class = tool_class
          @context = context
        end

        # Convert tool to MCP tool format.
        #
        # @return [Hash] :name, :description, :input_schema and :handler
        def to_mcp_tool
          {
            name: tool_name,
            description: tool_description,
            input_schema: build_input_schema,
            handler: method(:handle)
          }
        end

        # Handle an MCP tool call: run the tool with symbolized params and
        # format the outcome. Any StandardError is logged and returned as an
        # MCP error payload instead of propagating.
        #
        # @param params [Hash] call arguments
        # @return [Hash] MCP-formatted result or error
        def handle(params)
          result = @tool_class.new.execute(**params.symbolize_keys)
          ResultFormatter.format(result)
        rescue => e
          Rails.logger.error("[Aven::MCP] Tool execution error: #{e.message}")
          ResultFormatter.format_error(e)
        end

        private

        def tool_name
          @tool_class.tool_name
        end

        # The record lives on the tool class itself, like `tool_name`.
        # Going through `@tool_class.class` would ask `Class` for it.
        def tool_record
          @tool_class.tool_record
        end

        def tool_description
          tool_record&.effective_description || "No description"
        end

        # Build a JSON-schema object description from the record's parameters.
        # Without a record, an empty object schema is returned.
        def build_input_schema
          record = tool_record
          return { type: "object", properties: {} } unless record

          properties = {}
          required = []

          record.parameters.each do |param|
            properties[param.name] = {
              type: json_schema_type(param.param_type),
              description: param.effective_description
            }

            required << param.name if param.required?
          end

          { type: "object", properties:, required: }
        end

        # Map a stored parameter type to a JSON-schema type name; unknown
        # types fall back to "string".
        def json_schema_type(param_type)
          case param_type.to_sym
          when :integer then "integer"
          when :float then "number"
          when :boolean then "boolean"
          when :array then "array"
          when :object then "object"
          else "string"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Mcp
      # Wraps tool results and errors in the MCP content-list response shape.
      class ResultFormatter
        class << self
          # Format successful result for MCP response.
          #
          # @param result [Object] Tool execution result
          # @return [Hash] `{ content: [{ type: "text", text: ... }] }`
          def format(result)
            { content: [text_part(format_content(result))] }
          end

          # Format error for MCP response.
          #
          # @param error [Exception] Error that occurred
          # @return [Hash] same shape as #format plus `isError: true`
          def format_error(error)
            { content: [text_part("Error: #{error.message}")], isError: true }
          end

          private

          # A single MCP text content entry.
          def text_part(text)
            { type: "text", text: text }
          end

          # Render a result as text: strings pass through, hashes become
          # pretty-printed JSON, arrays are rendered element by element and
          # joined with a blank line, nil becomes "No result", and anything
          # else uses #to_s.
          def format_content(result)
            case result
            when String then result
            when Hash then JSON.pretty_generate(result)
            when Array then result.map { |element| format_content(element) }.join("\n\n")
            when nil then "No result"
            else result.to_s
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Mcp
      # Creates MCP server instances with Aven's enabled tools registered.
      class ServerFactory
        SERVER_NAME = "aven-mcp-server"
        SERVER_VERSION = Aven::VERSION

        class << self
          # Build an MCP server instance.
          #
          # @param server_context [Hash] Context data for the server; its
          #   :workspace entry (if any) scopes which tools are registered
          # @return [MCP::Server, nil] Configured server, or nil when
          #   ::MCP::Server is not defined
          def build(server_context: {})
            return nil unless defined?(::MCP::Server)

            ::MCP::Server.new(name: SERVER_NAME, version: SERVER_VERSION).tap do |server|
              register_tools(server, server_context)
            end
          end

          private

          # Wrap every built tool class in an Adapter and register it.
          def register_tools(server, context)
            Aven::Agentic::DynamicToolBuilder
              .build_all(workspace: context[:workspace])
              .each do |tool_class|
                server.register_tool(Adapter.new(tool_class, context).to_mcp_tool)
              end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Abstract parent for the text extractors. Subclasses implement
      # `extract` and can use `with_tempfile` to work on a local copy.
      class BaseExtractor
        class << self
          # Extract text from document.
          #
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Extracted text
          # @raise [NotImplementedError] always, in this base class
          def extract(document)
            raise NotImplementedError, "#{name} must implement extract"
          end

          protected

          # Copy the document's attached file into a temporary file, yield the
          # temp file's path, and remove the file afterwards (also when the
          # block raises).
          #
          # @param document [Aven::Agentic::Document]
          # @yieldparam path [String] path of the temporary copy
          # @return [Object, nil] the block's value, or nil when no file is attached
          def with_tempfile(document, &block)
            return nil unless document.file.attached?

            # Keep the original extension on the temp file name.
            suffix = File.extname(document.filename)
            scratch = Tempfile.new(["aven_ocr", suffix])

            begin
              scratch.binmode
              scratch.write(document.file.download)
              scratch.rewind

              yield scratch.path
            ensure
              # Close, then unlink.
              scratch.close(true)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Renders spreadsheet documents as tab-separated text, one section per sheet.
      class ExcelExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] text of all sheets, or nil when nothing could be extracted
          def extract(document)
            with_tempfile(document) { |path| extract_xlsx(path) }
          end

          private

          # Read the workbook with the roo gem, when it is loaded.
          # Each sheet becomes "## <name>", a blank line, then one line per
          # row with cells joined by tabs; sheets are separated by "---".
          # Any error is logged and yields nil.
          def extract_xlsx(path)
            unless defined?(Roo::Spreadsheet)
              Rails.logger.warn("[Aven::OCR] roo gem not available")
              return nil
            end

            workbook = Roo::Spreadsheet.open(path)
            sections = workbook.sheets.map do |sheet_name|
              lines = workbook.sheet(sheet_name).each.map do |row|
                row.map { |cell| cell.to_s.strip }.join("\t")
              end
              "## #{sheet_name}\n\n#{lines.join("\n")}"
            end

            sections.join("\n\n---\n\n")
          rescue => e
            Rails.logger.error("[Aven::OCR] Excel extraction failed: #{e.message}")
            nil
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Extracts text from image documents through the configured OCR provider.
      class ImageExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] OCR text, or nil when the provider is not :textract
          def extract(document)
            with_tempfile(document) do |path|
              next TextractClient.extract_document(path) if Aven.configuration.ocr&.provider == :textract

              Rails.logger.warn("[Aven::OCR] No OCR provider configured for images")
              nil
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Extracts text from PDFs: first from the embedded text layer, then by
      # OCR when that yields nothing.
      class PdfExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] extracted text
          def extract(document)
            with_tempfile(document) do |path|
              # Text-based PDFs are handled without OCR; scanned ones fall through.
              extract_text_layer(path).presence || extract_with_ocr(path)
            end
          end

          private

          # Read page text with the pdf-reader gem, when it is loaded.
          # Returns nil when the gem is missing, the text is blank, or
          # reading fails (the failure is logged as a warning).
          def extract_text_layer(path)
            return nil unless defined?(PDF::Reader)

            page_text = PDF::Reader.new(path).pages.map(&:text).join("\n\n")
            page_text.strip.present? ? page_text : nil
          rescue => e
            Rails.logger.warn("[Aven::OCR] PDF text extraction failed: #{e.message}")
            nil
          end

          # Run OCR through Textract when it is the configured provider.
          def extract_with_ocr(path)
            return TextractClient.extract_document(path) if Aven.configuration.ocr&.provider == :textract

            Rails.logger.warn("[Aven::OCR] No OCR provider configured")
            nil
          end
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Entry point for text extraction: picks the extractor that matches the
      # document's type and delegates to it.
      class Processor
        class << self
          # Process a document and extract text content.
          #
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] Extracted text content; nil when no extractor
          #   matches the document
          def process(document)
            extractor_for(document)&.extract(document)
          end

          private

          # The first matching type predicate wins; nil when none matches.
          def extractor_for(document)
            if document.pdf?
              PdfExtractor
            elsif document.image?
              ImageExtractor
            elsif document.word_doc?
              WordExtractor
            elsif document.excel?
              ExcelExtractor
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Thin wrapper around AWS Textract text detection, configured from
      # `Aven.configuration.ocr`.
      class TextractClient
        class << self
          # Extract text from document using AWS Textract (synchronous call).
          #
          # @param file_path [String] Path to the file
          # @return [String, nil] Extracted text; nil when no client could be
          #   built or Textract raised a service error (which is logged)
          def extract_document(file_path)
            client = build_client
            return nil unless client

            bytes = File.binread(file_path)
            response = client.detect_document_text(document: { bytes: })

            extract_text_from_response(response)
          rescue Aws::Textract::Errors::ServiceError => e
            Rails.logger.error("[Aven::Textract] API error: #{e.message}")
            nil
          end

          # Extract text from a document stored in S3 using Textract's
          # asynchronous job API. Blocks while polling for completion.
          #
          # @param s3_bucket [String] S3 bucket name
          # @param s3_key [String] S3 object key
          # @return [String, nil] Extracted text; nil on failure or timeout
          def extract_document_async(s3_bucket:, s3_key:)
            client = build_client
            return nil unless client

            start_response = client.start_document_text_detection(
              document_location: {
                s3_object: { bucket: s3_bucket, name: s3_key }
              }
            )

            wait_for_job(client, start_response.job_id)
          rescue Aws::Textract::Errors::ServiceError => e
            Rails.logger.error("[Aven::Textract] Async API error: #{e.message}")
            nil
          end

          private

          # Build a Textract client, or nil when the SDK is not loaded or no
          # region is configured.
          def build_client
            return nil unless defined?(Aws::Textract::Client)

            config = Aven.configuration.ocr
            return nil unless config&.aws_region

            options = { region: config.aws_region }
            # Only pass :credentials when explicit keys are configured.
            # Omitting the key lets the SDK resolve credentials through its
            # default provider chain; an explicit nil may be taken as the
            # configured value instead.
            credentials = aws_credentials(config)
            options[:credentials] = credentials if credentials

            Aws::Textract::Client.new(**options)
          end

          # Static credentials when both keys are configured, otherwise nil.
          def aws_credentials(config)
            return nil unless config.aws_access_key_id && config.aws_secret_access_key

            Aws::Credentials.new(config.aws_access_key_id, config.aws_secret_access_key)
          end

          # Join the text of all LINE blocks, ordered by page, then top, then
          # left. The page is part of the key so that a response holding
          # blocks from several pages keeps each page's lines together.
          def extract_text_from_response(response)
            response.blocks
                    .select { |block| block.block_type == "LINE" }
                    .sort_by do |block|
                      box = block.geometry.bounding_box
                      [block.page.to_i, box.top, box.left]
                    end
                    .map(&:text)
                    .join("\n")
          end

          # Poll the job until it reaches a terminal state.
          #
          # @param max_attempts [Integer] polls allowed while IN_PROGRESS
          # @param delay [Numeric] seconds to sleep between polls
          # @return [String, nil] collected text, or nil on failure, timeout
          #   or an unrecognised status
          def wait_for_job(client, job_id, max_attempts: 30, delay: 5)
            attempts = 0

            loop do
              response = client.get_document_text_detection(job_id:)

              case response.job_status
              when "SUCCEEDED"
                return collect_all_pages(client, job_id)
              when "PARTIAL_SUCCESS"
                # Terminal state: return whatever text was detected.
                Rails.logger.warn("[Aven::Textract] Job partially succeeded: #{response.status_message}")
                return collect_all_pages(client, job_id)
              when "FAILED"
                Rails.logger.error("[Aven::Textract] Job failed: #{response.status_message}")
                return nil
              when "IN_PROGRESS"
                attempts += 1
                if attempts >= max_attempts
                  Rails.logger.error("[Aven::Textract] Job timed out")
                  return nil
                end
                sleep(delay)
              else
                # Never keep looping on a status we do not understand.
                Rails.logger.error("[Aven::Textract] Unexpected job status: #{response.job_status.inspect}")
                return nil
              end
            end
          end

          # Follow next_token pagination and join the text of every result
          # batch with a blank line.
          def collect_all_pages(client, job_id)
            all_text = []
            next_token = nil

            loop do
              params = { job_id: }
              params[:next_token] = next_token if next_token

              response = client.get_document_text_detection(params)
              all_text << extract_text_from_response(response)

              next_token = response.next_token
              break unless next_token
            end

            all_text.join("\n\n")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module Aven
  module Agentic
    module Ocr
      # Extracts paragraph text from Word documents.
      class WordExtractor < BaseExtractor
        class << self
          # @param document [Aven::Agentic::Document]
          # @return [String, nil] paragraph text, or nil when nothing could be extracted
          def extract(document)
            with_tempfile(document) { |path| extract_docx(path) }
          end

          private

          # Read paragraphs with the docx gem, when it is loaded, and join
          # them with blank lines. Any error is logged and yields nil.
          def extract_docx(path)
            unless defined?(Docx::Document)
              Rails.logger.warn("[Aven::OCR] docx gem not available")
              return nil
            end

            Docx::Document.open(path).paragraphs.map(&:text).join("\n\n")
          rescue => e
            Rails.logger.error("[Aven::OCR] Word extraction failed: #{e.message}")
            nil
          end
        end
      end
    end
  end
end