ragdoll 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/Rakefile +52 -1
- data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
- data/app/jobs/ragdoll/extract_text_job.rb +38 -0
- data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
- data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
- data/app/lib/ragdoll/metadata_schemas.rb +332 -0
- data/app/models/ragdoll/audio_content.rb +142 -0
- data/app/models/ragdoll/content.rb +95 -0
- data/app/models/ragdoll/document.rb +611 -0
- data/app/models/ragdoll/embedding.rb +176 -0
- data/app/models/ragdoll/image_content.rb +194 -0
- data/app/models/ragdoll/text_content.rb +137 -0
- data/app/services/ragdoll/configuration_service.rb +113 -0
- data/app/services/ragdoll/document_management.rb +108 -0
- data/app/services/ragdoll/document_processor.rb +342 -0
- data/app/services/ragdoll/embedding_service.rb +202 -0
- data/app/services/ragdoll/image_description_service.rb +230 -0
- data/app/services/ragdoll/metadata_generator.rb +329 -0
- data/app/services/ragdoll/model_resolver.rb +72 -0
- data/app/services/ragdoll/search_engine.rb +51 -0
- data/app/services/ragdoll/text_chunker.rb +208 -0
- data/app/services/ragdoll/text_generation_service.rb +355 -0
- data/lib/ragdoll/core/client.rb +32 -41
- data/lib/ragdoll/core/configuration.rb +140 -156
- data/lib/ragdoll/core/database.rb +1 -1
- data/lib/ragdoll/core/model.rb +45 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +35 -17
- data/lib/ragdoll.rb +1 -1
- data/lib/tasks/annotate.rake +1 -1
- data/lib/tasks/db.rake +2 -2
- metadata +24 -20
- data/lib/ragdoll/core/document_management.rb +0 -110
- data/lib/ragdoll/core/document_processor.rb +0 -344
- data/lib/ragdoll/core/embedding_service.rb +0 -183
- data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
- data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
- data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
- data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
- data/lib/ragdoll/core/metadata_schemas.rb +0 -334
- data/lib/ragdoll/core/models/audio_content.rb +0 -175
- data/lib/ragdoll/core/models/content.rb +0 -126
- data/lib/ragdoll/core/models/document.rb +0 -678
- data/lib/ragdoll/core/models/embedding.rb +0 -204
- data/lib/ragdoll/core/models/image_content.rb +0 -227
- data/lib/ragdoll/core/models/text_content.rb +0 -169
- data/lib/ragdoll/core/search_engine.rb +0 -50
- data/lib/ragdoll/core/services/image_description_service.rb +0 -230
- data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
- data/lib/ragdoll/core/text_chunker.rb +0 -210
- data/lib/ragdoll/core/text_generation_service.rb +0 -360
data/lib/ragdoll/core.rb
CHANGED
@@ -7,26 +7,44 @@ $DEBUG_ME = true
|
|
7
7
|
|
8
8
|
# require_relative "../extensions/openstruct_merge" # File doesn't exist
|
9
9
|
|
10
|
+
# Add app/models, app/jobs, app/services, and app/lib to the load path
|
11
|
+
$LOAD_PATH.unshift(File.expand_path("../../app/models", __dir__))
|
12
|
+
$LOAD_PATH.unshift(File.expand_path("../../app/jobs", __dir__))
|
13
|
+
$LOAD_PATH.unshift(File.expand_path("../../app/services", __dir__))
|
14
|
+
$LOAD_PATH.unshift(File.expand_path("../../app/lib", __dir__))
|
15
|
+
|
10
16
|
require_relative "core/version"
|
11
17
|
require_relative "core/errors"
|
18
|
+
require_relative "core/model"
|
12
19
|
require_relative "core/configuration"
|
20
|
+
# Require services from app/services/ragdoll
|
21
|
+
require "ragdoll/configuration_service"
|
22
|
+
require "ragdoll/model_resolver"
|
13
23
|
require_relative "core/database"
|
14
24
|
require_relative "core/shrine_config"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
25
|
+
|
26
|
+
# Require models from app/models/ragdoll
|
27
|
+
require "ragdoll/document"
|
28
|
+
require "ragdoll/embedding"
|
29
|
+
require "ragdoll/content"
|
30
|
+
require "ragdoll/text_content"
|
31
|
+
require "ragdoll/audio_content"
|
32
|
+
require "ragdoll/image_content"
|
33
|
+
require "ragdoll/document_processor"
|
34
|
+
require "ragdoll/document_management"
|
35
|
+
require "ragdoll/text_chunker"
|
36
|
+
require "ragdoll/embedding_service"
|
37
|
+
require "ragdoll/text_generation_service"
|
38
|
+
require "ragdoll/search_engine"
|
39
|
+
require "ragdoll/image_description_service"
|
40
|
+
require "ragdoll/metadata_generator"
|
41
|
+
# Require from app/lib/ragdoll
|
42
|
+
require "ragdoll/metadata_schemas"
|
43
|
+
# Require jobs from app/jobs/ragdoll
|
44
|
+
require "ragdoll/generate_embeddings_job"
|
45
|
+
require "ragdoll/generate_summary_job"
|
46
|
+
require "ragdoll/extract_keywords_job"
|
47
|
+
require "ragdoll/extract_text_job"
|
30
48
|
require_relative "core/client"
|
31
49
|
|
32
50
|
module Ragdoll
|
@@ -56,8 +74,8 @@ module Ragdoll
|
|
56
74
|
end
|
57
75
|
|
58
76
|
# Factory method for creating clients
|
59
|
-
def self.client(
|
60
|
-
Client.new
|
77
|
+
def self.client(_config = nil)
|
78
|
+
Client.new
|
61
79
|
end
|
62
80
|
|
63
81
|
# Delegate high-level API methods to default client
|
data/lib/ragdoll.rb
CHANGED
@@ -137,7 +137,7 @@ module Ragdoll
|
|
137
137
|
# all_docs.each { |doc| puts doc.title }
|
138
138
|
# @return [ActiveRecord::Relation] a relation of all documents.
|
139
139
|
def documents
|
140
|
-
Ragdoll::Core::Models::Document.all
|
140
|
+
Ragdoll::Document.all
|
141
141
|
end
|
142
142
|
alias_method :docs, :documents
|
143
143
|
|
data/lib/tasks/annotate.rake
CHANGED
@@ -18,7 +18,7 @@ task :environment do
|
|
18
18
|
adapter: "postgresql",
|
19
19
|
database: "ragdoll_development",
|
20
20
|
username: "ragdoll",
|
21
|
-
password: ENV
|
21
|
+
password: ENV.fetch("RAGDOLL_DATABASE_PASSWORD", ENV["DATABASE_PASSWORD"]),
|
22
22
|
host: "localhost",
|
23
23
|
port: 5432,
|
24
24
|
auto_migrate: false # Don't auto-migrate during annotation
|
data/lib/tasks/db.rake
CHANGED
@@ -18,7 +18,7 @@ namespace :db do
|
|
18
18
|
ActiveRecord::Base.establish_connection(
|
19
19
|
adapter: 'postgresql',
|
20
20
|
database: 'postgres', # Connect to postgres database initially
|
21
|
-
username: ENV
|
21
|
+
username: ENV.fetch('POSTGRES_SUPERUSER', 'postgres'),
|
22
22
|
password: ENV['POSTGRES_SUPERUSER_PASSWORD'],
|
23
23
|
host: config.database_config[:host] || 'localhost',
|
24
24
|
port: config.database_config[:port] || 5432
|
@@ -60,7 +60,7 @@ namespace :db do
|
|
60
60
|
ActiveRecord::Base.establish_connection(
|
61
61
|
adapter: 'postgresql',
|
62
62
|
database: 'ragdoll_development',
|
63
|
-
username: ENV
|
63
|
+
username: ENV.fetch('POSTGRES_SUPERUSER', 'postgres'),
|
64
64
|
password: ENV['POSTGRES_SUPERUSER_PASSWORD'],
|
65
65
|
host: config.database_config[:host] || 'localhost',
|
66
66
|
port: config.database_config[:port] || 5432
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ragdoll
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dewayne VanHoozer
|
@@ -18,6 +18,27 @@ extra_rdoc_files: []
|
|
18
18
|
files:
|
19
19
|
- README.md
|
20
20
|
- Rakefile
|
21
|
+
- app/jobs/ragdoll/extract_keywords_job.rb
|
22
|
+
- app/jobs/ragdoll/extract_text_job.rb
|
23
|
+
- app/jobs/ragdoll/generate_embeddings_job.rb
|
24
|
+
- app/jobs/ragdoll/generate_summary_job.rb
|
25
|
+
- app/lib/ragdoll/metadata_schemas.rb
|
26
|
+
- app/models/ragdoll/audio_content.rb
|
27
|
+
- app/models/ragdoll/content.rb
|
28
|
+
- app/models/ragdoll/document.rb
|
29
|
+
- app/models/ragdoll/embedding.rb
|
30
|
+
- app/models/ragdoll/image_content.rb
|
31
|
+
- app/models/ragdoll/text_content.rb
|
32
|
+
- app/services/ragdoll/configuration_service.rb
|
33
|
+
- app/services/ragdoll/document_management.rb
|
34
|
+
- app/services/ragdoll/document_processor.rb
|
35
|
+
- app/services/ragdoll/embedding_service.rb
|
36
|
+
- app/services/ragdoll/image_description_service.rb
|
37
|
+
- app/services/ragdoll/metadata_generator.rb
|
38
|
+
- app/services/ragdoll/model_resolver.rb
|
39
|
+
- app/services/ragdoll/search_engine.rb
|
40
|
+
- app/services/ragdoll/text_chunker.rb
|
41
|
+
- app/services/ragdoll/text_generation_service.rb
|
21
42
|
- db/migrate/001_enable_postgresql_extensions.rb
|
22
43
|
- db/migrate/004_create_ragdoll_documents.rb
|
23
44
|
- db/migrate/005_create_ragdoll_embeddings.rb
|
@@ -28,27 +49,9 @@ files:
|
|
28
49
|
- lib/ragdoll/core/client.rb
|
29
50
|
- lib/ragdoll/core/configuration.rb
|
30
51
|
- lib/ragdoll/core/database.rb
|
31
|
-
- lib/ragdoll/core/document_management.rb
|
32
|
-
- lib/ragdoll/core/document_processor.rb
|
33
|
-
- lib/ragdoll/core/embedding_service.rb
|
34
52
|
- lib/ragdoll/core/errors.rb
|
35
|
-
- lib/ragdoll/core/jobs/extract_keywords.rb
|
36
|
-
- lib/ragdoll/core/jobs/extract_text.rb
|
37
|
-
- lib/ragdoll/core/jobs/generate_embeddings.rb
|
38
|
-
- lib/ragdoll/core/jobs/generate_summary.rb
|
39
|
-
- lib/ragdoll/core/metadata_schemas.rb
|
40
|
-
- lib/ragdoll/core/models/audio_content.rb
|
41
|
-
- lib/ragdoll/core/models/content.rb
|
42
|
-
- lib/ragdoll/core/models/document.rb
|
43
|
-
- lib/ragdoll/core/models/embedding.rb
|
44
|
-
- lib/ragdoll/core/models/image_content.rb
|
45
|
-
- lib/ragdoll/core/models/text_content.rb
|
46
|
-
- lib/ragdoll/core/search_engine.rb
|
47
|
-
- lib/ragdoll/core/services/image_description_service.rb
|
48
|
-
- lib/ragdoll/core/services/metadata_generator.rb
|
53
|
+
- lib/ragdoll/core/model.rb
|
49
54
|
- lib/ragdoll/core/shrine_config.rb
|
50
|
-
- lib/ragdoll/core/text_chunker.rb
|
51
|
-
- lib/ragdoll/core/text_generation_service.rb
|
52
55
|
- lib/ragdoll/core/version.rb
|
53
56
|
- lib/tasks/annotate.rake
|
54
57
|
- lib/tasks/db.rake
|
@@ -63,6 +66,7 @@ metadata:
|
|
63
66
|
rdoc_options: []
|
64
67
|
require_paths:
|
65
68
|
- lib
|
69
|
+
- app/models
|
66
70
|
required_ruby_version: !ruby/object:Gem::Requirement
|
67
71
|
requirements:
|
68
72
|
- - ">="
|
@@ -1,110 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Ragdoll
|
4
|
-
module Core
|
5
|
-
class DocumentManagement
|
6
|
-
class << self
|
7
|
-
def add_document(location, content, metadata = {})
|
8
|
-
# Ensure location is an absolute path if it's a file path
|
9
|
-
absolute_location = location.start_with?("http") || location.start_with?("ftp") ? location : File.expand_path(location)
|
10
|
-
|
11
|
-
# Get file modification time if it's a file path
|
12
|
-
file_modified_at = if File.exist?(absolute_location) && !absolute_location.start_with?("http")
|
13
|
-
File.mtime(absolute_location)
|
14
|
-
else
|
15
|
-
Time.current
|
16
|
-
end
|
17
|
-
|
18
|
-
# Check if document already exists with same location and file_modified_at
|
19
|
-
existing_document = Models::Document.find_by(
|
20
|
-
location: absolute_location,
|
21
|
-
file_modified_at: file_modified_at
|
22
|
-
)
|
23
|
-
|
24
|
-
# Return existing document ID if found (skip duplicate)
|
25
|
-
return existing_document.id.to_s if existing_document
|
26
|
-
|
27
|
-
document = Models::Document.create!(
|
28
|
-
location: absolute_location,
|
29
|
-
title: metadata[:title] || metadata["title"] || extract_title_from_location(location),
|
30
|
-
document_type: metadata[:document_type] || metadata["document_type"] || "text",
|
31
|
-
metadata: metadata.is_a?(Hash) ? metadata : {},
|
32
|
-
status: "pending",
|
33
|
-
file_modified_at: file_modified_at
|
34
|
-
)
|
35
|
-
|
36
|
-
# Set content using the model's setter to trigger TextContent creation
|
37
|
-
document.content = content if content.present?
|
38
|
-
|
39
|
-
document.id.to_s
|
40
|
-
end
|
41
|
-
|
42
|
-
def get_document(id)
|
43
|
-
document = Models::Document.find_by(id: id)
|
44
|
-
return nil unless document
|
45
|
-
|
46
|
-
hash = document.to_hash
|
47
|
-
hash[:content] = document.content
|
48
|
-
hash
|
49
|
-
end
|
50
|
-
|
51
|
-
def update_document(id, **updates)
|
52
|
-
document = Models::Document.find_by(id: id)
|
53
|
-
return nil unless document
|
54
|
-
|
55
|
-
# Only update allowed fields
|
56
|
-
allowed_updates = updates.slice(:title, :metadata, :status, :document_type)
|
57
|
-
document.update!(allowed_updates) if allowed_updates.any?
|
58
|
-
|
59
|
-
document.to_hash
|
60
|
-
end
|
61
|
-
|
62
|
-
def delete_document(id)
|
63
|
-
document = Models::Document.find_by(id: id)
|
64
|
-
return nil unless document
|
65
|
-
|
66
|
-
document.destroy!
|
67
|
-
true
|
68
|
-
end
|
69
|
-
|
70
|
-
def list_documents(options = {})
|
71
|
-
limit = options[:limit] || 100
|
72
|
-
offset = options[:offset] || 0
|
73
|
-
|
74
|
-
Models::Document.offset(offset).limit(limit).recent.map(&:to_hash)
|
75
|
-
end
|
76
|
-
|
77
|
-
def get_document_stats
|
78
|
-
Models::Document.stats
|
79
|
-
end
|
80
|
-
|
81
|
-
# FIXME: should this be here?
|
82
|
-
|
83
|
-
def add_embedding(embeddable_id, chunk_index, embedding_vector, metadata = {})
|
84
|
-
# The embeddable_type should be the actual STI subclass, not the base class
|
85
|
-
embeddable_type = if metadata[:embeddable_type]
|
86
|
-
metadata[:embeddable_type]
|
87
|
-
else
|
88
|
-
# Look up the actual STI type from the content record
|
89
|
-
content = Models::Content.find(embeddable_id)
|
90
|
-
content.class.name
|
91
|
-
end
|
92
|
-
|
93
|
-
Models::Embedding.create!(
|
94
|
-
embeddable_id: embeddable_id,
|
95
|
-
embeddable_type: embeddable_type,
|
96
|
-
chunk_index: chunk_index,
|
97
|
-
embedding_vector: embedding_vector,
|
98
|
-
content: metadata[:content] || ""
|
99
|
-
).id.to_s
|
100
|
-
end
|
101
|
-
|
102
|
-
private
|
103
|
-
|
104
|
-
def extract_title_from_location(location)
|
105
|
-
File.basename(location, File.extname(location))
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
@@ -1,344 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "pdf-reader"
|
4
|
-
require "docx"
|
5
|
-
require "rmagick"
|
6
|
-
require_relative "services/image_description_service"
|
7
|
-
|
8
|
-
module Ragdoll
|
9
|
-
module Core
|
10
|
-
class DocumentProcessor
|
11
|
-
class ParseError < DocumentError; end
|
12
|
-
class UnsupportedFormatError < ParseError; end
|
13
|
-
|
14
|
-
def self.parse(file_path)
|
15
|
-
new(file_path).parse
|
16
|
-
end
|
17
|
-
|
18
|
-
# Parse from Shrine attached file
|
19
|
-
def self.parse_attachment(attached_file)
|
20
|
-
attached_file.open do |tempfile|
|
21
|
-
new(tempfile.path, attached_file).parse
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# Create document from file path
|
26
|
-
def self.create_document_from_file(file_path, **options)
|
27
|
-
parsed = parse(file_path)
|
28
|
-
|
29
|
-
# Get file modification time
|
30
|
-
file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current
|
31
|
-
|
32
|
-
document = Models::Document.create!(
|
33
|
-
location: File.expand_path(file_path),
|
34
|
-
title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
|
35
|
-
content: parsed[:content],
|
36
|
-
document_type: determine_document_type(file_path),
|
37
|
-
metadata: parsed[:metadata] || {},
|
38
|
-
status: "processed",
|
39
|
-
file_modified_at: file_modified_at,
|
40
|
-
**options
|
41
|
-
)
|
42
|
-
|
43
|
-
# Attach the file if it exists
|
44
|
-
document.file = File.open(file_path) if File.exist?(file_path)
|
45
|
-
|
46
|
-
document
|
47
|
-
end
|
48
|
-
|
49
|
-
# Create document from uploaded file (Shrine compatible)
|
50
|
-
def self.create_document_from_upload(uploaded_file, **options)
|
51
|
-
# Create document first
|
52
|
-
document = Models::Document.create!(
|
53
|
-
location: uploaded_file.original_filename || "uploaded_file",
|
54
|
-
title: options[:title] || File.basename(uploaded_file.original_filename || "uploaded_file",
|
55
|
-
File.extname(uploaded_file.original_filename || "")),
|
56
|
-
content: "", # Will be extracted after file attachment
|
57
|
-
document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
|
58
|
-
status: "processing",
|
59
|
-
metadata: options[:metadata] || {},
|
60
|
-
file_modified_at: Time.current
|
61
|
-
)
|
62
|
-
|
63
|
-
# Attach the file
|
64
|
-
document.file = uploaded_file
|
65
|
-
|
66
|
-
# Extract content from attached file
|
67
|
-
if document.file.present?
|
68
|
-
parsed = parse_attachment(document.file)
|
69
|
-
document.update!(
|
70
|
-
content: parsed[:content],
|
71
|
-
title: parsed[:title] || document.title,
|
72
|
-
metadata: document.metadata.merge(parsed[:metadata] || {}),
|
73
|
-
status: "processed"
|
74
|
-
)
|
75
|
-
end
|
76
|
-
|
77
|
-
document
|
78
|
-
end
|
79
|
-
|
80
|
-
def initialize(file_path, attached_file = nil)
|
81
|
-
@file_path = file_path
|
82
|
-
@attached_file = attached_file
|
83
|
-
@file_extension = File.extname(file_path).downcase
|
84
|
-
end
|
85
|
-
|
86
|
-
def parse
|
87
|
-
case @file_extension
|
88
|
-
when ".pdf"
|
89
|
-
parse_pdf
|
90
|
-
when ".docx"
|
91
|
-
parse_docx
|
92
|
-
when ".txt", ".md", ".markdown"
|
93
|
-
parse_text
|
94
|
-
when ".html", ".htm"
|
95
|
-
parse_html
|
96
|
-
when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
|
97
|
-
parse_image
|
98
|
-
else
|
99
|
-
parse_text # Default to text parsing for unknown formats
|
100
|
-
end
|
101
|
-
rescue StandardError => e # StandardError => e
|
102
|
-
raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
|
103
|
-
end
|
104
|
-
|
105
|
-
private
|
106
|
-
|
107
|
-
def parse_pdf
|
108
|
-
content = ""
|
109
|
-
metadata = {}
|
110
|
-
|
111
|
-
begin
|
112
|
-
PDF::Reader.open(@file_path) do |reader|
|
113
|
-
# Extract metadata
|
114
|
-
if reader.info
|
115
|
-
metadata[:title] = reader.info[:Title] if reader.info[:Title]
|
116
|
-
metadata[:author] = reader.info[:Author] if reader.info[:Author]
|
117
|
-
metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
|
118
|
-
metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
|
119
|
-
metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
|
120
|
-
metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
|
121
|
-
metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
|
122
|
-
end
|
123
|
-
|
124
|
-
metadata[:page_count] = reader.page_count
|
125
|
-
|
126
|
-
# Extract text from all pages
|
127
|
-
reader.pages.each_with_index do |page, index|
|
128
|
-
page_text = page.text.strip
|
129
|
-
next if page_text.empty?
|
130
|
-
|
131
|
-
content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
|
132
|
-
content += page_text
|
133
|
-
end
|
134
|
-
end
|
135
|
-
rescue PDF::Reader::MalformedPDFError => e
|
136
|
-
raise ParseError, "Malformed PDF: #{e.message}"
|
137
|
-
rescue PDF::Reader::UnsupportedFeatureError => e
|
138
|
-
raise ParseError, "Unsupported PDF feature: #{e.message}"
|
139
|
-
end
|
140
|
-
|
141
|
-
{
|
142
|
-
content: content.strip,
|
143
|
-
metadata: metadata,
|
144
|
-
document_type: "pdf"
|
145
|
-
}
|
146
|
-
end
|
147
|
-
|
148
|
-
def parse_docx
|
149
|
-
content = ""
|
150
|
-
metadata = {}
|
151
|
-
|
152
|
-
begin
|
153
|
-
doc = Docx::Document.open(@file_path)
|
154
|
-
|
155
|
-
# Extract core properties
|
156
|
-
if doc.core_properties
|
157
|
-
metadata[:title] = doc.core_properties.title if doc.core_properties.title
|
158
|
-
metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
|
159
|
-
metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
|
160
|
-
metadata[:description] = doc.core_properties.description if doc.core_properties.description
|
161
|
-
metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
|
162
|
-
metadata[:created] = doc.core_properties.created if doc.core_properties.created
|
163
|
-
metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
|
164
|
-
if doc.core_properties.last_modified_by
|
165
|
-
metadata[:last_modified_by] =
|
166
|
-
doc.core_properties.last_modified_by
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
# Extract text from paragraphs
|
171
|
-
doc.paragraphs.each do |paragraph|
|
172
|
-
paragraph_text = paragraph.text.strip
|
173
|
-
next if paragraph_text.empty?
|
174
|
-
|
175
|
-
content += "#{paragraph_text}\n\n"
|
176
|
-
end
|
177
|
-
|
178
|
-
# Extract text from tables
|
179
|
-
doc.tables.each_with_index do |table, table_index|
|
180
|
-
content += "\n--- Table #{table_index + 1} ---\n\n"
|
181
|
-
|
182
|
-
table.rows.each do |row|
|
183
|
-
row_text = row.cells.map(&:text).join(" | ")
|
184
|
-
content += "#{row_text}\n" unless row_text.strip.empty?
|
185
|
-
end
|
186
|
-
|
187
|
-
content += "\n"
|
188
|
-
end
|
189
|
-
|
190
|
-
metadata[:paragraph_count] = doc.paragraphs.count
|
191
|
-
metadata[:table_count] = doc.tables.count
|
192
|
-
rescue StandardError => e # StandardError => e
|
193
|
-
raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
|
194
|
-
end
|
195
|
-
|
196
|
-
{
|
197
|
-
content: content.strip,
|
198
|
-
metadata: metadata,
|
199
|
-
document_type: "docx"
|
200
|
-
}
|
201
|
-
end
|
202
|
-
|
203
|
-
def parse_text
|
204
|
-
content = File.read(@file_path, encoding: "UTF-8")
|
205
|
-
metadata = {
|
206
|
-
file_size: File.size(@file_path),
|
207
|
-
encoding: "UTF-8"
|
208
|
-
}
|
209
|
-
|
210
|
-
document_type = case @file_extension
|
211
|
-
when ".md", ".markdown" then "markdown"
|
212
|
-
when ".txt" then "text"
|
213
|
-
else "text"
|
214
|
-
end
|
215
|
-
|
216
|
-
{
|
217
|
-
content: content,
|
218
|
-
metadata: metadata,
|
219
|
-
document_type: document_type
|
220
|
-
}
|
221
|
-
rescue Encoding::InvalidByteSequenceError
|
222
|
-
# Try with different encoding
|
223
|
-
content = File.read(@file_path, encoding: "ISO-8859-1")
|
224
|
-
metadata = {
|
225
|
-
file_size: File.size(@file_path),
|
226
|
-
encoding: "ISO-8859-1"
|
227
|
-
}
|
228
|
-
|
229
|
-
{
|
230
|
-
content: content,
|
231
|
-
metadata: metadata,
|
232
|
-
document_type: "text"
|
233
|
-
}
|
234
|
-
end
|
235
|
-
|
236
|
-
def parse_html
|
237
|
-
content = File.read(@file_path, encoding: "UTF-8")
|
238
|
-
|
239
|
-
# Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
|
240
|
-
clean_content = content
|
241
|
-
.gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
|
242
|
-
.gsub(%r{<style[^>]*>.*?</style>}mi, "") # Remove style tags
|
243
|
-
.gsub(/<[^>]+>/, " ") # Remove all HTML tags
|
244
|
-
.gsub(/\s+/, " ") # Normalize whitespace
|
245
|
-
.strip
|
246
|
-
|
247
|
-
metadata = {
|
248
|
-
file_size: File.size(@file_path),
|
249
|
-
original_format: "html"
|
250
|
-
}
|
251
|
-
|
252
|
-
{
|
253
|
-
content: clean_content,
|
254
|
-
metadata: metadata,
|
255
|
-
document_type: "html"
|
256
|
-
}
|
257
|
-
end
|
258
|
-
|
259
|
-
def parse_image
|
260
|
-
puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"
|
261
|
-
|
262
|
-
metadata = {
|
263
|
-
file_size: File.size(@file_path),
|
264
|
-
file_type: @file_extension.sub(".", ""),
|
265
|
-
original_filename: File.basename(@file_path)
|
266
|
-
}
|
267
|
-
|
268
|
-
# Extract image dimensions
|
269
|
-
begin
|
270
|
-
img = Magick::Image.read(@file_path).first
|
271
|
-
metadata[:width] = img.columns
|
272
|
-
metadata[:height] = img.rows
|
273
|
-
puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
|
274
|
-
rescue StandardError => e # StandardError
|
275
|
-
puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
|
276
|
-
metadata[:width] = nil
|
277
|
-
metadata[:height] = nil
|
278
|
-
end
|
279
|
-
|
280
|
-
puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
|
281
|
-
desc = Services::ImageDescriptionService.new.generate_description(@file_path)
|
282
|
-
|
283
|
-
puts "📝 DocumentProcessor: Received description: '#{desc}'"
|
284
|
-
|
285
|
-
metadata[:description] = desc if desc && !desc.empty?
|
286
|
-
|
287
|
-
# Use AI-generated description or fallback placeholder
|
288
|
-
content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"
|
289
|
-
|
290
|
-
puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"
|
291
|
-
|
292
|
-
{
|
293
|
-
content: content,
|
294
|
-
metadata: metadata,
|
295
|
-
document_type: "image"
|
296
|
-
}
|
297
|
-
end
|
298
|
-
|
299
|
-
# Helper methods for document type determination
|
300
|
-
def self.determine_document_type(file_path)
|
301
|
-
case File.extname(file_path).downcase
|
302
|
-
when ".pdf" then "pdf"
|
303
|
-
when ".docx" then "docx"
|
304
|
-
when ".txt" then "text"
|
305
|
-
when ".md", ".markdown" then "markdown"
|
306
|
-
when ".html", ".htm" then "html"
|
307
|
-
when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
|
308
|
-
else "text"
|
309
|
-
end
|
310
|
-
end
|
311
|
-
|
312
|
-
def self.determine_document_type_from_content_type(content_type)
|
313
|
-
case content_type
|
314
|
-
when "application/pdf" then "pdf"
|
315
|
-
when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
|
316
|
-
when "text/plain" then "text"
|
317
|
-
when "text/markdown" then "markdown"
|
318
|
-
when "text/html" then "html"
|
319
|
-
when %r{^image/} then "image"
|
320
|
-
else "text"
|
321
|
-
end
|
322
|
-
end
|
323
|
-
|
324
|
-
def self.determine_content_type(file_path)
|
325
|
-
case File.extname(file_path).downcase
|
326
|
-
when ".pdf" then "application/pdf"
|
327
|
-
when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
328
|
-
when ".txt" then "text/plain"
|
329
|
-
when ".md", ".markdown" then "text/markdown"
|
330
|
-
when ".html", ".htm" then "text/html"
|
331
|
-
when ".jpg", ".jpeg" then "image/jpeg"
|
332
|
-
when ".png" then "image/png"
|
333
|
-
when ".gif" then "image/gif"
|
334
|
-
when ".webp" then "image/webp"
|
335
|
-
when ".bmp" then "image/bmp"
|
336
|
-
when ".svg" then "image/svg+xml"
|
337
|
-
when ".ico" then "image/x-icon"
|
338
|
-
when ".tiff", ".tif" then "image/tiff"
|
339
|
-
else "application/octet-stream"
|
340
|
-
end
|
341
|
-
end
|
342
|
-
end
|
343
|
-
end
|
344
|
-
end
|