ragdoll 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +353 -0
- data/Rakefile +21 -0
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +315 -0
- data/lib/ragdoll/core/configuration.rb +273 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/document_management.rb +110 -0
- data/lib/ragdoll/core/document_processor.rb +344 -0
- data/lib/ragdoll/core/embedding_service.rb +183 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
- data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
- data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
- data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
- data/lib/ragdoll/core/metadata_schemas.rb +334 -0
- data/lib/ragdoll/core/models/audio_content.rb +175 -0
- data/lib/ragdoll/core/models/content.rb +126 -0
- data/lib/ragdoll/core/models/document.rb +678 -0
- data/lib/ragdoll/core/models/embedding.rb +204 -0
- data/lib/ragdoll/core/models/image_content.rb +227 -0
- data/lib/ragdoll/core/models/text_content.rb +169 -0
- data/lib/ragdoll/core/search_engine.rb +50 -0
- data/lib/ragdoll/core/services/image_description_service.rb +230 -0
- data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/text_chunker.rb +210 -0
- data/lib/ragdoll/core/text_generation_service.rb +360 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +73 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +249 -0
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +80 -0
data/lib/ragdoll/core/database.rb
@@ -0,0 +1,141 @@
# frozen_string_literal: true

require "active_record"
require "logger"

module Ragdoll
  module Core
    class Database
      def self.setup(config = {})
        database_config = default_config.merge(config)

        # Set up ActiveRecord connection
        ActiveRecord::Base.establish_connection(database_config)

        # Set up logging if specified
        ActiveRecord::Base.logger = database_config[:logger] if database_config[:logger]

        # Auto-migrate if specified
        return unless database_config[:auto_migrate]

        migrate!
      end

      def self.migrate!
        # Get the path to the gem root directory
        # Current file is lib/ragdoll/core/database.rb, so go up 3 levels to get to gem root
        gem_root = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", ".."))
        migration_paths = [
          File.join(gem_root, "db", "migrate")
        ]

        ActiveRecord::Migration.verbose = true

        # Ensure schema_migrations table exists first
        unless ActiveRecord::Base.connection.table_exists?("schema_migrations")
          ActiveRecord::Base.connection.create_table("schema_migrations", id: false) do |t|
            t.string :version, null: false
          end
          ActiveRecord::Base.connection.add_index("schema_migrations", :version, unique: true)
        end

        # Debug migration path (silenced for clean test output)
        # puts "Migration path: #{migration_paths.first}" if ActiveRecord::Migration.verbose
        migration_files = Dir[File.join(migration_paths.first, "*.rb")].sort
        # puts "Found #{migration_files.length} migration files" if ActiveRecord::Migration.verbose

        # Load and run each migration manually since ActiveRecord migration context seems broken
        migration_files.each do |migration_file|
          # Extract version from filename
          version = File.basename(migration_file, ".rb").split("_").first

          # Skip if already migrated
          next if ActiveRecord::Base.connection.select_values(
            "SELECT version FROM schema_migrations WHERE version = '#{version}'"
          ).any?

          # Load the migration file to define the class
          require migration_file

          # Get the migration class - convert snake_case to CamelCase
          filename_parts = File.basename(migration_file, ".rb").split("_")[1..]
          migration_class_name = filename_parts.map(&:capitalize).join

          begin
            migration_class = Object.const_get(migration_class_name)
          rescue NameError
            puts "Warning: Could not find migration class #{migration_class_name} in #{migration_file}"
            next
          end

          # Run the migration quietly
          old_verbose = ActiveRecord::Migration.verbose
          ActiveRecord::Migration.verbose = false
          migration_class.migrate(:up)
          ActiveRecord::Migration.verbose = old_verbose

          # Record the migration
          ActiveRecord::Base.connection.insert(
            "INSERT INTO schema_migrations (version) VALUES ('#{version}')"
          )

          # Silenced migration progress - uncomment for debugging
          # puts "Migrated #{migration_class_name}" if ActiveRecord::Migration.verbose
        end
      end

      def self.reset!
        ActiveRecord::Migration.verbose = false

        # Drop all tables in correct order (respecting foreign key constraints)
        # Order: dependent tables first, then parent tables
        tables_to_drop = %w[
          ragdoll_embeddings
          ragdoll_text_contents
          ragdoll_image_contents
          ragdoll_audio_contents
          ragdoll_documents
          schema_migrations
        ]

        tables_to_drop.each do |table|
          if ActiveRecord::Base.connection.table_exists?(table)
            # For PostgreSQL, we can use CASCADE to drop dependent objects
            if ActiveRecord::Base.connection.adapter_name.downcase.include?("postgresql")
              ActiveRecord::Base.connection.execute("DROP TABLE IF EXISTS #{table} CASCADE")
            else
              ActiveRecord::Base.connection.drop_table(table)
            end
          end
        end

        migrate!
      end

      def self.connected?
        ActiveRecord::Base.connected?
      end

      def self.disconnect!
        ActiveRecord::Base.clear_all_connections!
      end

      def self.default_config
        {
          adapter: "postgresql",
          database: "ragdoll_development",
          username: "ragdoll",
          password: ENV["RAGDOLL_DATABASE_PASSWORD"],
          host: "localhost",
          port: 5432,
          auto_migrate: true,
          logger: Logger.new($stdout, level: Logger::WARN)
        }
      end

      def self.migration_paths
        [File.join(File.dirname(__FILE__), "..", "..", "..", "..", "db", "migrate")]
      end
    end
  end
end
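
A minimal usage sketch for the class above (not part of the gem's files): it assumes that require "ragdoll" loads Ragdoll::Core, and the database name is a placeholder; the option keys come from default_config shown above.

require "ragdoll"

# Override only what differs from Database.default_config; unspecified keys
# (adapter, host, port, logger, ...) fall back to the defaults above.
Ragdoll::Core::Database.setup(
  database: "ragdoll_demo",   # placeholder name, not from this diff
  auto_migrate: true          # runs Database.migrate! after connecting
)

Ragdoll::Core::Database.connected?  # => true once the connection is established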
data/lib/ragdoll/core/document_management.rb
@@ -0,0 +1,110 @@
# frozen_string_literal: true

module Ragdoll
  module Core
    class DocumentManagement
      class << self
        def add_document(location, content, metadata = {})
          # Ensure location is an absolute path if it's a file path
          absolute_location = location.start_with?("http") || location.start_with?("ftp") ? location : File.expand_path(location)

          # Get file modification time if it's a file path
          file_modified_at = if File.exist?(absolute_location) && !absolute_location.start_with?("http")
                               File.mtime(absolute_location)
                             else
                               Time.current
                             end

          # Check if document already exists with same location and file_modified_at
          existing_document = Models::Document.find_by(
            location: absolute_location,
            file_modified_at: file_modified_at
          )

          # Return existing document ID if found (skip duplicate)
          return existing_document.id.to_s if existing_document

          document = Models::Document.create!(
            location: absolute_location,
            title: metadata[:title] || metadata["title"] || extract_title_from_location(location),
            document_type: metadata[:document_type] || metadata["document_type"] || "text",
            metadata: metadata.is_a?(Hash) ? metadata : {},
            status: "pending",
            file_modified_at: file_modified_at
          )

          # Set content using the model's setter to trigger TextContent creation
          document.content = content if content.present?

          document.id.to_s
        end

        def get_document(id)
          document = Models::Document.find_by(id: id)
          return nil unless document

          hash = document.to_hash
          hash[:content] = document.content
          hash
        end

        def update_document(id, **updates)
          document = Models::Document.find_by(id: id)
          return nil unless document

          # Only update allowed fields
          allowed_updates = updates.slice(:title, :metadata, :status, :document_type)
          document.update!(allowed_updates) if allowed_updates.any?

          document.to_hash
        end

        def delete_document(id)
          document = Models::Document.find_by(id: id)
          return nil unless document

          document.destroy!
          true
        end

        def list_documents(options = {})
          limit = options[:limit] || 100
          offset = options[:offset] || 0

          Models::Document.offset(offset).limit(limit).recent.map(&:to_hash)
        end

        def get_document_stats
          Models::Document.stats
        end

        # FIXME: should this be here?

        def add_embedding(embeddable_id, chunk_index, embedding_vector, metadata = {})
          # The embeddable_type should be the actual STI subclass, not the base class
          embeddable_type = if metadata[:embeddable_type]
                              metadata[:embeddable_type]
                            else
                              # Look up the actual STI type from the content record
                              content = Models::Content.find(embeddable_id)
                              content.class.name
                            end

          Models::Embedding.create!(
            embeddable_id: embeddable_id,
            embeddable_type: embeddable_type,
            chunk_index: chunk_index,
            embedding_vector: embedding_vector,
            content: metadata[:content] || ""
          ).id.to_s
        end

        private

        def extract_title_from_location(location)
          File.basename(location, File.extname(location))
        end
      end
    end
  end
end
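
A hypothetical usage sketch for DocumentManagement (not part of the gem's files); the file path, title, and metadata values are placeholders, and it assumes the schema has already been created via Database.setup:

# add_document resolves the path, skips duplicates by location + mtime,
# and returns the document id as a string.
id = Ragdoll::Core::DocumentManagement.add_document(
  "docs/guide.md",
  File.read("docs/guide.md"),
  { title: "Guide", document_type: "markdown" }
)

doc = Ragdoll::Core::DocumentManagement.get_document(id)      # hash with :content merged in
Ragdoll::Core::DocumentManagement.update_document(id, status: "processed")
Ragdoll::Core::DocumentManagement.list_documents(limit: 10)
Ragdoll::Core::DocumentManagement.delete_document(id)         # => true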
data/lib/ragdoll/core/document_processor.rb
@@ -0,0 +1,344 @@
# frozen_string_literal: true

require "pdf-reader"
require "docx"
require "rmagick"
require_relative "services/image_description_service"

module Ragdoll
  module Core
    class DocumentProcessor
      class ParseError < DocumentError; end
      class UnsupportedFormatError < ParseError; end

      def self.parse(file_path)
        new(file_path).parse
      end

      # Parse from Shrine attached file
      def self.parse_attachment(attached_file)
        attached_file.open do |tempfile|
          new(tempfile.path, attached_file).parse
        end
      end

      # Create document from file path
      def self.create_document_from_file(file_path, **options)
        parsed = parse(file_path)

        # Get file modification time
        file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current

        document = Models::Document.create!(
          location: File.expand_path(file_path),
          title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
          content: parsed[:content],
          document_type: determine_document_type(file_path),
          metadata: parsed[:metadata] || {},
          status: "processed",
          file_modified_at: file_modified_at,
          **options
        )

        # Attach the file if it exists
        document.file = File.open(file_path) if File.exist?(file_path)

        document
      end

      # Create document from uploaded file (Shrine compatible)
      def self.create_document_from_upload(uploaded_file, **options)
        # Create document first
        document = Models::Document.create!(
          location: uploaded_file.original_filename || "uploaded_file",
          title: options[:title] || File.basename(uploaded_file.original_filename || "uploaded_file",
                                                  File.extname(uploaded_file.original_filename || "")),
          content: "", # Will be extracted after file attachment
          document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
          status: "processing",
          metadata: options[:metadata] || {},
          file_modified_at: Time.current
        )

        # Attach the file
        document.file = uploaded_file

        # Extract content from attached file
        if document.file.present?
          parsed = parse_attachment(document.file)
          document.update!(
            content: parsed[:content],
            title: parsed[:title] || document.title,
            metadata: document.metadata.merge(parsed[:metadata] || {}),
            status: "processed"
          )
        end

        document
      end

      def initialize(file_path, attached_file = nil)
        @file_path = file_path
        @attached_file = attached_file
        @file_extension = File.extname(file_path).downcase
      end

      def parse
        case @file_extension
        when ".pdf"
          parse_pdf
        when ".docx"
          parse_docx
        when ".txt", ".md", ".markdown"
          parse_text
        when ".html", ".htm"
          parse_html
        when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif"
          parse_image
        else
          parse_text # Default to text parsing for unknown formats
        end
      rescue StandardError => e
        raise ParseError, "#{__LINE__} Failed to parse #{@file_path}: #{e.message}"
      end

      private

      def parse_pdf
        content = ""
        metadata = {}

        begin
          PDF::Reader.open(@file_path) do |reader|
            # Extract metadata
            if reader.info
              metadata[:title] = reader.info[:Title] if reader.info[:Title]
              metadata[:author] = reader.info[:Author] if reader.info[:Author]
              metadata[:subject] = reader.info[:Subject] if reader.info[:Subject]
              metadata[:creator] = reader.info[:Creator] if reader.info[:Creator]
              metadata[:producer] = reader.info[:Producer] if reader.info[:Producer]
              metadata[:creation_date] = reader.info[:CreationDate] if reader.info[:CreationDate]
              metadata[:modification_date] = reader.info[:ModDate] if reader.info[:ModDate]
            end

            metadata[:page_count] = reader.page_count

            # Extract text from all pages
            reader.pages.each_with_index do |page, index|
              page_text = page.text.strip
              next if page_text.empty?

              content += "\n\n--- Page #{index + 1} ---\n\n" if content.length.positive?
              content += page_text
            end
          end
        rescue PDF::Reader::MalformedPDFError => e
          raise ParseError, "Malformed PDF: #{e.message}"
        rescue PDF::Reader::UnsupportedFeatureError => e
          raise ParseError, "Unsupported PDF feature: #{e.message}"
        end

        {
          content: content.strip,
          metadata: metadata,
          document_type: "pdf"
        }
      end

      def parse_docx
        content = ""
        metadata = {}

        begin
          doc = Docx::Document.open(@file_path)

          # Extract core properties
          if doc.core_properties
            metadata[:title] = doc.core_properties.title if doc.core_properties.title
            metadata[:author] = doc.core_properties.creator if doc.core_properties.creator
            metadata[:subject] = doc.core_properties.subject if doc.core_properties.subject
            metadata[:description] = doc.core_properties.description if doc.core_properties.description
            metadata[:keywords] = doc.core_properties.keywords if doc.core_properties.keywords
            metadata[:created] = doc.core_properties.created if doc.core_properties.created
            metadata[:modified] = doc.core_properties.modified if doc.core_properties.modified
            if doc.core_properties.last_modified_by
              metadata[:last_modified_by] =
                doc.core_properties.last_modified_by
            end
          end

          # Extract text from paragraphs
          doc.paragraphs.each do |paragraph|
            paragraph_text = paragraph.text.strip
            next if paragraph_text.empty?

            content += "#{paragraph_text}\n\n"
          end

          # Extract text from tables
          doc.tables.each_with_index do |table, table_index|
            content += "\n--- Table #{table_index + 1} ---\n\n"

            table.rows.each do |row|
              row_text = row.cells.map(&:text).join(" | ")
              content += "#{row_text}\n" unless row_text.strip.empty?
            end

            content += "\n"
          end

          metadata[:paragraph_count] = doc.paragraphs.count
          metadata[:table_count] = doc.tables.count
        rescue StandardError => e
          raise ParseError, "#{__LINE__} Failed to parse DOCX: #{e.message}"
        end

        {
          content: content.strip,
          metadata: metadata,
          document_type: "docx"
        }
      end

      def parse_text
        content = File.read(@file_path, encoding: "UTF-8")
        metadata = {
          file_size: File.size(@file_path),
          encoding: "UTF-8"
        }

        document_type = case @file_extension
                        when ".md", ".markdown" then "markdown"
                        when ".txt" then "text"
                        else "text"
                        end

        {
          content: content,
          metadata: metadata,
          document_type: document_type
        }
      rescue Encoding::InvalidByteSequenceError
        # Try with different encoding
        content = File.read(@file_path, encoding: "ISO-8859-1")
        metadata = {
          file_size: File.size(@file_path),
          encoding: "ISO-8859-1"
        }

        {
          content: content,
          metadata: metadata,
          document_type: "text"
        }
      end

      def parse_html
        content = File.read(@file_path, encoding: "UTF-8")

        # Basic HTML tag stripping (for more advanced parsing, consider using Nokogiri)
        clean_content = content
                        .gsub(%r{<script[^>]*>.*?</script>}mi, "") # Remove script tags
                        .gsub(%r{<style[^>]*>.*?</style>}mi, "")   # Remove style tags
                        .gsub(/<[^>]+>/, " ")                      # Remove all HTML tags
                        .gsub(/\s+/, " ")                          # Normalize whitespace
                        .strip

        metadata = {
          file_size: File.size(@file_path),
          original_format: "html"
        }

        {
          content: clean_content,
          metadata: metadata,
          document_type: "html"
        }
      end

      def parse_image
        puts "🖼️ DocumentProcessor: Starting image parsing for #{@file_path}"

        metadata = {
          file_size: File.size(@file_path),
          file_type: @file_extension.sub(".", ""),
          original_filename: File.basename(@file_path)
        }

        # Extract image dimensions
        begin
          img = Magick::Image.read(@file_path).first
          metadata[:width] = img.columns
          metadata[:height] = img.rows
          puts "📏 DocumentProcessor: Image dimensions: #{img.columns}x#{img.rows}"
        rescue StandardError => e
          puts "❌ DocumentProcessor: Failed to get image dimensions: #{e.message}"
          metadata[:width] = nil
          metadata[:height] = nil
        end

        puts "🤖 DocumentProcessor: Creating ImageDescriptionService and calling generate_description..."
        desc = Services::ImageDescriptionService.new.generate_description(@file_path)

        puts "📝 DocumentProcessor: Received description: '#{desc}'"

        metadata[:description] = desc if desc && !desc.empty?

        # Use AI-generated description or fallback placeholder
        content = desc && !desc.empty? ? desc : "Image file: #{File.basename(@file_path)}"

        puts "✅ DocumentProcessor: Image parsing complete. Content: '#{content[0..100]}...'"

        {
          content: content,
          metadata: metadata,
          document_type: "image"
        }
      end

      # Helper methods for document type determination
      def self.determine_document_type(file_path)
        case File.extname(file_path).downcase
        when ".pdf" then "pdf"
        when ".docx" then "docx"
        when ".txt" then "text"
        when ".md", ".markdown" then "markdown"
        when ".html", ".htm" then "html"
        when ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".ico", ".tiff", ".tif" then "image"
        else "text"
        end
      end

      def self.determine_document_type_from_content_type(content_type)
        case content_type
        when "application/pdf" then "pdf"
        when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
        when "text/plain" then "text"
        when "text/markdown" then "markdown"
        when "text/html" then "html"
        when %r{^image/} then "image"
        else "text"
        end
      end

      def self.determine_content_type(file_path)
        case File.extname(file_path).downcase
        when ".pdf" then "application/pdf"
        when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        when ".txt" then "text/plain"
        when ".md", ".markdown" then "text/markdown"
        when ".html", ".htm" then "text/html"
        when ".jpg", ".jpeg" then "image/jpeg"
        when ".png" then "image/png"
        when ".gif" then "image/gif"
        when ".webp" then "image/webp"
        when ".bmp" then "image/bmp"
        when ".svg" then "image/svg+xml"
        when ".ico" then "image/x-icon"
        when ".tiff", ".tif" then "image/tiff"
        else "application/octet-stream"
        end
      end
    end
  end
end
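
A usage sketch for DocumentProcessor (not part of the gem's files); the file path is a placeholder and the hash keys mirror those returned by the parse_* methods above:

# Parse a file directly and inspect the returned hash.
parsed = Ragdoll::Core::DocumentProcessor.parse("reports/annual.pdf")
parsed[:document_type]           # => "pdf"
parsed[:metadata][:page_count]   # from PDF::Reader
parsed[:content]                 # page text joined with "--- Page N ---" separators

# Or parse and persist a Models::Document in one call.
document = Ragdoll::Core::DocumentProcessor.create_document_from_file("reports/annual.pdf")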