universal_document_processor 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AI_USAGE_GUIDE.md +404 -0
- data/CHANGELOG.md +67 -0
- data/GEM_RELEASE_GUIDE.md +288 -0
- data/Gemfile +27 -0
- data/LICENSE +21 -0
- data/README.md +726 -0
- data/Rakefile +36 -0
- data/lib/universal_document_processor/ai_agent.rb +491 -0
- data/lib/universal_document_processor/document.rb +225 -0
- data/lib/universal_document_processor/processors/archive_processor.rb +290 -0
- data/lib/universal_document_processor/processors/base_processor.rb +58 -0
- data/lib/universal_document_processor/processors/character_validator.rb +283 -0
- data/lib/universal_document_processor/processors/excel_processor.rb +219 -0
- data/lib/universal_document_processor/processors/image_processor.rb +172 -0
- data/lib/universal_document_processor/processors/pdf_processor.rb +105 -0
- data/lib/universal_document_processor/processors/powerpoint_processor.rb +214 -0
- data/lib/universal_document_processor/processors/text_processor.rb +360 -0
- data/lib/universal_document_processor/processors/word_processor.rb +137 -0
- data/lib/universal_document_processor/utils/file_detector.rb +83 -0
- data/lib/universal_document_processor/utils/japanese_filename_handler.rb +205 -0
- data/lib/universal_document_processor/version.rb +3 -0
- data/lib/universal_document_processor.rb +223 -0
- metadata +198 -0
# frozen_string_literal: true

# Rakefile for the universal_document_processor gem.
#
# Tasks shell out via Rake's `sh` (not `system`) so that a non-zero exit
# status aborts the task immediately — previously a failed `gem build`
# would be silently followed by `gem push` / `gem install`.

require "bundler/gem_tasks"
require "rspec/core/rake_task"
require "rubocop/rake_task"

RSpec::Core::RakeTask.new(:spec)
RuboCop::RakeTask.new

desc "Run tests and linting"
task default: %i[spec rubocop]

# NOTE(review): :build, :release and :install shadow the tasks that
# bundler/gem_tasks already defines; Rake appends actions to the existing
# task rather than replacing it. Kept for backward compatibility.
desc "Build the gem"
task :build do
  sh "gem build universal_document_processor.gemspec"
end

desc "Release the gem"
task :release do
  # `sh` raises on failure, so a broken build can never reach `gem push`.
  sh "gem build universal_document_processor.gemspec"
  sh "gem push universal_document_processor-*.gem"
end

desc "Install the gem locally"
task :install do
  sh "gem build universal_document_processor.gemspec"
  sh "gem install universal_document_processor-*.gem"
end

desc "Clean build artifacts"
task :clean do
  # Rake tasks include FileUtils, so rm_f is available directly and is
  # portable (no shell glob dependency).
  rm_f Dir["*.gem"]
end

desc "Generate documentation"
task :doc do
  sh "yard doc"
end
# frozen_string_literal: true

require 'net/http'
require 'json'
require 'uri'

module UniversalDocumentProcessor
  # AI-powered document analysis agent backed by the OpenAI chat
  # completions API.
  #
  # The agent keeps a bounded conversation history (last +max_history+
  # question/answer pairs) and exposes high-level helpers (summarize,
  # translate, classify, extract action items, compare, chat) that all
  # funnel through a single API call path.
  class AIAgent
    attr_reader :api_key, :model, :base_url, :conversation_history

    # @param options [Hash]
    # @option options [String]  :api_key      OpenAI key (falls back to ENV['OPENAI_API_KEY'])
    # @option options [String]  :model        model name (default 'gpt-4')
    # @option options [String]  :base_url     API root (default OpenAI v1 endpoint)
    # @option options [Integer] :max_history  max retained Q/A pairs (default 10)
    # @option options [Float]   :temperature  sampling temperature (default 0.7)
    # @raise [ArgumentError] when the API key is missing or empty
    def initialize(options = {})
      @api_key = options[:api_key] || ENV['OPENAI_API_KEY']
      @model = options[:model] || 'gpt-4'
      @base_url = options[:base_url] || 'https://api.openai.com/v1'
      @conversation_history = []
      @max_history = options[:max_history] || 10
      @temperature = options[:temperature] || 0.7

      validate_configuration
    end

    # Main document analysis with AI. With a +query+ this answers that
    # specific question; without one it performs a general analysis.
    def analyze_document(document_result, query = nil)
      context = build_document_context(document_result)

      if query
        # Specific query about the document
        analyze_with_query(context, query)
      else
        # General document analysis
        perform_general_analysis(context)
      end
    end

    # Answer a specific question against an already-built context hash.
    def analyze_with_query(context, query)
      prompt = build_question_prompt(context, query)
      response = call_openai_api(prompt)
      add_to_history("Analyze document: #{query}", response)
      response
    end

    # Run the fixed five-point general analysis prompt over a context hash.
    def perform_general_analysis(context)
      prompt = "You are an AI document analyst. Provide a comprehensive analysis of this document:

Document: #{context[:filename]} (#{context[:content_type]})
Size: #{format_file_size(context[:file_size])}
Images: #{context[:images_count]}
Tables: #{context[:tables_count]}
#{context[:japanese_filename] ? "Japanese filename: Yes" : ""}

Content:
#{truncate_content(context[:text_content], 3500)}

Please provide:
1. Document summary
2. Key topics and themes
3. Document structure analysis
4. Content quality assessment
5. Recommendations for use"

      response = call_openai_api(prompt)
      add_to_history("General document analysis", response)
      response
    end

    # Ask specific questions about a document
    def ask_document_question(document_result, question)
      context = build_document_context(document_result)

      prompt = build_question_prompt(context, question)
      response = call_openai_api(prompt)

      add_to_history(question, response)
      response
    end

    # Summarize document content.
    # @param length [Symbol] :short, :medium or :long (anything else => "concisely")
    def summarize_document(document_result, length: :medium)
      context = build_document_context(document_result)

      length_instruction = case length
                           when :short then "in 2-3 sentences"
                           when :medium then "in 1-2 paragraphs"
                           when :long then "in detail with key points"
                           else "concisely"
                           end

      prompt = build_summary_prompt(context, length_instruction)
      response = call_openai_api(prompt)

      add_to_history("Summarize document #{length_instruction}", response)
      response
    end

    # Extract key information from document.
    # @param categories [Array<String>, nil] categories to extract; defaults below
    # @return [Hash, String] parsed JSON when the model returned one, else raw text
    def extract_key_information(document_result, categories = nil)
      context = build_document_context(document_result)
      categories ||= ['key_facts', 'important_dates', 'names', 'locations', 'numbers']

      prompt = build_extraction_prompt(context, categories)
      response = call_openai_api(prompt)

      add_to_history("Extract key information: #{categories.join(', ')}", response)
      parse_extraction_response(response)
    end

    # Translate document content
    def translate_document(document_result, target_language)
      context = build_document_context(document_result)

      prompt = build_translation_prompt(context, target_language)
      response = call_openai_api(prompt)

      add_to_history("Translate to #{target_language}", response)
      response
    end

    # Generate document insights and recommendations
    def generate_insights(document_result)
      context = build_document_context(document_result)

      prompt = build_insights_prompt(context)
      response = call_openai_api(prompt)

      add_to_history("Generate insights", response)
      parse_insights_response(response)
    end

    # Compare multiple documents
    def compare_documents(document_results, comparison_type = :content)
      contexts = document_results.map { |doc| build_document_context(doc) }

      prompt = build_comparison_prompt(contexts, comparison_type)
      response = call_openai_api(prompt)

      add_to_history("Compare documents (#{comparison_type})", response)
      response
    end

    # Classify document type and purpose
    def classify_document(document_result)
      context = build_document_context(document_result)

      prompt = build_classification_prompt(context)
      response = call_openai_api(prompt)

      add_to_history("Classify document", response)
      parse_classification_response(response)
    end

    # Generate action items from document
    def extract_action_items(document_result)
      context = build_document_context(document_result)

      prompt = build_action_items_prompt(context)
      response = call_openai_api(prompt)

      add_to_history("Extract action items", response)
      parse_action_items_response(response)
    end

    # Chat about the document (or chat generally when no document is given).
    def chat(message, document_result = nil)
      if document_result
        context = build_document_context(document_result)
        prompt = build_chat_prompt(context, message)
      else
        prompt = build_general_chat_prompt(message)
      end

      response = call_openai_api(prompt)
      add_to_history(message, response)
      response
    end

    # Reset conversation history
    def reset_conversation
      @conversation_history.clear
    end

    # Get conversation summary (asks the model to summarize the history).
    def conversation_summary
      return "No conversation history" if @conversation_history.empty?

      history_text = @conversation_history.map do |entry|
        "Q: #{entry[:question]}\nA: #{entry[:answer]}"
      end.join("\n\n")

      prompt = "Summarize this conversation:\n\n#{history_text}"
      call_openai_api(prompt)
    end

    private

    def validate_configuration
      raise ArgumentError, "OpenAI API key is required" unless @api_key
      raise ArgumentError, "OpenAI API key cannot be empty" if @api_key.empty?
    end

    # Normalize a processor result hash into the context hash every prompt
    # builder consumes. Missing keys degrade to safe defaults.
    def build_document_context(document_result)
      context = {
        filename: document_result[:file_path] ? File.basename(document_result[:file_path]) : "Unknown",
        content_type: document_result[:content_type] || "Unknown",
        file_size: document_result[:file_size] || 0,
        text_content: document_result[:text_content] || "",
        metadata: document_result[:metadata] || {},
        images_count: document_result[:images]&.length || 0,
        tables_count: document_result[:tables]&.length || 0,
        filename_info: document_result[:filename_info] || {}
      }

      # Add Japanese-specific information if available
      if context[:filename_info][:contains_japanese]
        context[:japanese_filename] = true
        context[:japanese_parts] = context[:filename_info][:japanese_parts]
      end

      context
    end

    def build_question_prompt(context, question)
      "You are an AI assistant analyzing a document. Here's the document information:

Filename: #{context[:filename]}
Type: #{context[:content_type]}
Size: #{format_file_size(context[:file_size])}
Images: #{context[:images_count]}
Tables: #{context[:tables_count]}
#{context[:japanese_filename] ? "Japanese filename: Yes" : ""}

Content:
#{truncate_content(context[:text_content], 3000)}

Question: #{question}

Please provide a detailed and accurate answer based on the document content."
    end

    def build_summary_prompt(context, length_instruction)
      "You are an AI assistant. Please summarize the following document #{length_instruction}:

Document: #{context[:filename]} (#{context[:content_type]})
Content:
#{truncate_content(context[:text_content], 4000)}

Provide a clear and informative summary."
    end

    def build_extraction_prompt(context, categories)
      "You are an AI assistant. Extract the following information from this document:

Categories to extract: #{categories.join(', ')}

Document: #{context[:filename]}
Content:
#{truncate_content(context[:text_content], 3500)}

Please provide the extracted information in a structured format with clear categories."
    end

    def build_translation_prompt(context, target_language)
      "You are a professional translator. Translate the following document content to #{target_language}:

Document: #{context[:filename]}
Original content:
#{truncate_content(context[:text_content], 3000)}

Please provide an accurate and natural translation."
    end

    def build_insights_prompt(context)
      "You are an AI analyst. Analyze this document and provide insights, key themes, and recommendations:

Document: #{context[:filename]} (#{context[:content_type]})
Content:
#{truncate_content(context[:text_content], 3500)}

Please provide:
1. Key themes and topics
2. Important insights
3. Potential concerns or issues
4. Recommendations or next steps
5. Overall assessment"
    end

    def build_comparison_prompt(contexts, comparison_type)
      comparison_content = contexts.map.with_index do |context, index|
        "Document #{index + 1}: #{context[:filename]}
Content: #{truncate_content(context[:text_content], 1500)}"
      end.join("\n\n---\n\n")

      "You are an AI analyst. Compare these documents focusing on #{comparison_type}:

#{comparison_content}

Please provide a detailed comparison highlighting similarities, differences, and key insights."
    end

    def build_classification_prompt(context)
      "You are a document classification expert. Classify this document:

Document: #{context[:filename]} (#{context[:content_type]})
Content:
#{truncate_content(context[:text_content], 2000)}

Please classify this document by:
1. Document type (e.g., report, contract, manual, etc.)
2. Industry/domain
3. Purpose/intent
4. Urgency level
5. Target audience

Provide your classification with reasoning."
    end

    def build_action_items_prompt(context)
      "You are an AI assistant specialized in extracting actionable items. Analyze this document:

Document: #{context[:filename]}
Content:
#{truncate_content(context[:text_content], 3000)}

Extract and list:
1. Action items or tasks mentioned
2. Deadlines or due dates
3. Responsible parties (if mentioned)
4. Priority levels
5. Dependencies

Format as a clear, actionable list."
    end

    def build_chat_prompt(context, message)
      history_context = @conversation_history.last(5).map do |entry|
        "Previous Q: #{entry[:question]}\nPrevious A: #{entry[:answer]}"
      end.join("\n")

      "You are an AI assistant discussing a document with a user.

Document context:
Filename: #{context[:filename]}
Type: #{context[:content_type]}
Content: #{truncate_content(context[:text_content], 2000)}

#{history_context.empty? ? "" : "Recent conversation:\n#{history_context}\n"}

User message: #{message}

Please respond helpfully based on the document and our conversation."
    end

    def build_general_chat_prompt(message)
      history_context = @conversation_history.last(5).map do |entry|
        "Q: #{entry[:question]}\nA: #{entry[:answer]}"
      end.join("\n")

      "You are an AI assistant helping with document processing tasks.

#{history_context.empty? ? "" : "Recent conversation:\n#{history_context}\n"}

User message: #{message}

Please respond helpfully."
    end

    # POST a single-turn chat completion and return the assistant text.
    # @raise [RuntimeError] on any non-200 API response
    def call_openai_api(prompt)
      uri = URI("#{@base_url}/chat/completions")

      http = Net::HTTP.new(uri.host, uri.port)
      # Follow the configured scheme instead of forcing TLS, so a custom
      # base_url (e.g. a local proxy over http) still works.
      http.use_ssl = (uri.scheme == 'https')
      http.open_timeout = 30
      http.read_timeout = 60

      request = Net::HTTP::Post.new(uri)
      request['Content-Type'] = 'application/json'
      request['Authorization'] = "Bearer #{@api_key}"

      request.body = {
        model: @model,
        messages: [
          {
            role: "system",
            content: "You are an intelligent document processing assistant with expertise in analyzing, summarizing, and extracting information from various document types. You support multiple languages including Japanese."
          },
          {
            role: "user",
            content: prompt
          }
        ],
        temperature: @temperature,
        max_tokens: 2000
      }.to_json

      response = http.request(request)

      if response.code.to_i == 200
        result = JSON.parse(response.body)
        result.dig('choices', 0, 'message', 'content') || "No response generated"
      else
        # Only rescue the parse failure; the previous inline `rescue`
        # modifier silently swallowed every StandardError here.
        error_body = begin
          JSON.parse(response.body)
        rescue JSON::ParserError
          response.body
        end
        raise "OpenAI API Error (#{response.code}): #{error_body}"
      end
    end

    # Append a Q/A pair and trim the history to the most recent entries.
    def add_to_history(question, answer)
      @conversation_history << {
        question: question,
        answer: answer,
        timestamp: Time.now
      }

      # Keep only the most recent conversations
      @conversation_history = @conversation_history.last(@max_history) if @conversation_history.length > @max_history
    end

    # Clip content to +max_length+ characters, marking the truncation.
    # Non-string input (e.g. nil) yields "".
    def truncate_content(content, max_length)
      return "" unless content.is_a?(String)

      if content.length > max_length
        "#{content[0...max_length]}...\n\n[Content truncated for analysis]"
      else
        content
      end
    end

    # Human-readable file size. Non-positive byte counts (including the
    # previously mishandled negative case) report as "0 B".
    def format_file_size(bytes)
      return "0 B" if bytes <= 0

      units = ['B', 'KB', 'MB', 'GB']
      size = bytes.to_f
      unit_index = 0

      while size >= 1024 && unit_index < units.length - 1
        size /= 1024
        unit_index += 1
      end

      "#{size.round(2)} #{units[unit_index]}"
    end

    # Best-effort: return the first JSON object embedded in the response,
    # falling back to the raw text when none parses.
    def parse_extraction_response(response)
      # Try to parse structured response
      begin
        # Look for JSON-like structure in response
        if response.include?('{') && response.include?('}')
          # Extract JSON part
          json_match = response.match(/\{.*\}/m)
          if json_match
            return JSON.parse(json_match[0])
          end
        end
      rescue JSON::ParserError
        # Fall back to plain text response
      end

      response
    end

    def parse_insights_response(response)
      {
        raw_response: response,
        timestamp: Time.now,
        insights: extract_numbered_list(response)
      }
    end

    def parse_classification_response(response)
      {
        raw_response: response,
        classification: response,
        timestamp: Time.now
      }
    end

    def parse_action_items_response(response)
      {
        raw_response: response,
        action_items: extract_numbered_list(response),
        timestamp: Time.now
      }
    end

    # Collect lines that start with a digit, dash, asterisk or bullet.
    def extract_numbered_list(text)
      # Extract numbered or bulleted lists from text
      items = []
      text.split("\n").each do |line|
        if line.match(/^\s*[\d\-\*\•]\s*(.+)/)
          items << line.strip
        end
      end
      items
    end
  end
end