smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,483 @@
1
+ require_relative '../parsers/query_parser'
2
+
3
+ module SmartRAG
4
+ module Core
5
+ # FulltextManager handles full-text search functionality and tsvector indexes
6
+ # Supports multi-language tokenization and language detection
7
+ class FulltextManager
8
+ attr_reader :db, :query_parser, :logger
9
+
10
# tsvector weight label used for each indexed field (labels run A-D, A highest).
WEIGHTS = { title: 'A', content: 'B' }.freeze
15
+
16
# Baseline search settings; the constructor merges caller-supplied options over these.
DEFAULT_CONFIG = { max_results: 100, default_language: 'en', result_limits: 20 }.freeze
22
+
23
# Build a manager bound to a database handle.
# @param db [Sequel::Database] Database connection
# @param options [Hash] Configuration options. :query_parser and :logger
#   replace the default collaborators; the whole hash (those two keys
#   included) is merged over DEFAULT_CONFIG.
def initialize(db, options = {})
  @config = DEFAULT_CONFIG.merge(options)
  @logger = options[:logger] || Logger.new(STDOUT)
  @query_parser = options[:query_parser] || Parsers::QueryParser.new
  @db = db
end
32
+
33
# Store or update the full-text index row for a section.
#
# build_weighted_vector returns SQL expression text (setweight/to_tsvector
# calls). Those expressions must reach the database as SQL, so they are
# wrapped in Sequel.lit; passing them as ordinary Ruby strings would make
# Sequel quote them as string values and the expression text itself would be
# stored instead of the computed vector.
#
# @param section_id [Integer] Section ID
# @param title [String, nil] Section title; a blank title is stored as ''
# @param content [String] Section content
# @param language [String] Language code
# @return [Boolean] true on success, false when a Sequel::Error is raised
# @raise [ArgumentError] if section_id or content is nil
# @raise [Errors::FulltextSearchError] on any other StandardError
def update_fulltext_index(section_id, title, content, language = 'en')
  raise ArgumentError, 'Section ID cannot be nil' unless section_id
  raise ArgumentError, 'Content cannot be nil' unless content

  # Decide configs for single or mixed-language tsvector
  configs = build_language_configs(language, "#{title}\n\n#{content}")

  title_sql = title.to_s.strip.empty? ? '' : build_weighted_vector(configs, title, WEIGHTS[:title])
  content_sql = build_weighted_vector(configs, content, WEIGHTS[:content])
  # tsvector concatenation; blank parts are skipped so no dangling "||" is produced.
  combined_sql = [title_sql, content_sql].reject(&:empty?).join(' || ')

  # '' is kept as a plain value (as the original did for blank titles);
  # anything else is an SQL expression and must not be quoted.
  to_literal = ->(sql) { sql.empty? ? '' : Sequel.lit(sql) }
  ts_title = to_literal.call(title_sql)
  ts_content = to_literal.call(content_sql)
  ts_combined = to_literal.call(combined_sql)

  # Upsert keyed on section_id
  db[:section_fts].insert_conflict(
    target: :section_id,
    update: {
      language: language,
      fts_title: ts_title,
      fts_content: ts_content,
      fts_combined: ts_combined,
      updated_at: Sequel::CURRENT_TIMESTAMP
    }
  ).insert(
    section_id: section_id,
    language: language,
    fts_title: ts_title,
    fts_content: ts_content,
    fts_combined: ts_combined
  )

  @logger.info "Updated full-text index for section #{section_id}"
  true
rescue ArgumentError
  # Argument validation failures are programming errors; let them surface.
  raise
rescue Sequel::Error => e
  @logger.error "Failed to update full-text index for section #{section_id}: #{e.message}"
  @logger.error e.backtrace.join("\n")
  false
rescue StandardError => e
  @logger.error "Failed to update full-text index for section #{section_id}: #{e.message}"
  @logger.error e.backtrace.join("\n")
  raise Errors::FulltextSearchError, e.message
end
95
+
96
# Index many sections in one call, isolating failures per section.
# @param sections [Array<Hash>] Hashes with :id, :title, :content and an
#   optional :language (defaults to 'en')
# @return [Hash] { success: Integer, failed: Integer, errors: Array<Hash> }
def batch_update_fulltext(sections)
  tally = { success: 0, failed: 0, errors: [] }

  sections.each do |entry|
    begin
      indexed = update_fulltext_index(entry[:id], entry[:title], entry[:content], entry[:language] || 'en')
      next tally[:success] += 1 if indexed

      # update_fulltext_index reported failure without raising
      tally[:failed] += 1
      tally[:errors] << { section_id: entry[:id], error: 'Update failed' }
    rescue StandardError => e
      # One bad section must not abort the rest of the batch
      tally[:failed] += 1
      tally[:errors] << { section_id: entry[:id], error: e.message }
      @logger.error "Batch update failed for section #{entry[:id]}: #{e.message}"
    end
  end

  @logger.info "Batch updated #{tally[:success]} full-text indexes, #{tally[:failed]} failed"
  tally
end
125
+
126
# Basic full-text search over section_fts joined to source_sections.
#
# @param query [String] Search query text
# @param language [String, nil] Language code (detected from the query if nil)
# @param limit [Integer] Maximum results
# @param options [Hash] Additional options; only :filters is read here
# @return [Array<Hash>] Formatted results ordered by rank, or [] when the
#   database handle is missing or has no section_fts table
# @raise [ArgumentError] if the query is nil or blank
# @raise [Errors::FulltextSearchError] on any other failure
def search_by_text(query, language = nil, limit = 20, options = {})
  raise ArgumentError, 'Query cannot be nil' if query.nil?
  raise ArgumentError, 'Query cannot be empty' if query.strip.empty?

  language ||= detect_language_from_query(query)
  tsquery = build_tsquery(query, language)

  # Check if db is valid and table exists before querying
  return [] if db.nil? || !db.respond_to?(:table_exists?) || !db.table_exists?(:section_fts)

  # NOTE(review): tsquery is spliced into the statement as raw SQL, so its
  # safety depends entirely on what the query parser emits - confirm there.
  tsquery_sql = Sequel.lit(tsquery)

  dataset = db[:section_fts]
            .select(
              Sequel[:section_fts][:section_id],
              Sequel[:section_fts][:language],
              Sequel[:source_sections][:section_title],
              Sequel[:source_sections][:content],
              Sequel[:source_sections][:document_id],
              Sequel.function(:ts_rank, Sequel[:section_fts][:fts_combined], tsquery_sql).as(:rank_score),
              Sequel.function(
                :ts_headline,
                Sequel[:source_sections][:content],
                tsquery_sql,
                'MaxWords=50, MinWords=15, MaxFragments=3'
              ).as(:highlight)
            )
            .join(:source_sections, id: Sequel[:section_fts][:section_id])
            .where(Sequel.lit("section_fts.fts_combined @@ #{tsquery}"))
            .order(Sequel.desc(:rank_score))
            .limit(limit)

  dataset = apply_search_filters(dataset, options[:filters])

  # Logged after filtering so the statement shown is the one that is executed.
  @logger.debug "Fulltext search SQL: #{dataset.sql}"
  @logger.debug "Fulltext search tsquery: #{tsquery}"

  results = dataset.all.map { |row| format_search_result(row, query) }

  @logger.info "Full-text search returned #{results.length} results" if results.any?

  results
rescue ArgumentError
  # Argument validation failures are programming errors; let them surface.
  raise
rescue StandardError => e
  @logger.error "Full-text search failed: #{e.message}"
  @logger.error e.backtrace.join("\n")
  raise Errors::FulltextSearchError, "Search failed: #{e.message}"
end
192
+
193
# Full-text search with filters.
#
# The filters are merged into a copy of the options so the caller's hash is
# left untouched (and a frozen options hash is accepted).
#
# @param query [String] Search query
# @param filters [Hash] Filter options, forwarded as options[:filters]
# @param options [Hash] Search options; :limit defaults to 20
# @return [Array] Filtered results from search_by_text
def search_with_filters(query, filters, options = {})
  search_options = options.merge(filters: filters)
  search_by_text(query, nil, search_options[:limit] || 20, search_options)
end
202
+
203
# Hybrid search combining full-text results with vector results supplied by
# the caller's block, fused with Reciprocal Rank Fusion.
#
# @param text_query [String, nil] Full-text query (text search skipped if nil)
# @param vector_query [Array, nil] Vector embedding, passed to the block
# @param options [Hash] :limit (default 20) and :rrf_k (default 60)
# @yield [vector_query, candidate_limit] optional vector search; should return
#   an array of result hashes. A nil return is treated as "no vector results".
# @return [Array] Up to :limit fused results
# @raise [Errors::HybridSearchError] on any failure. This includes the
#   ArgumentError raised below when both queries are nil, because the rescue
#   covers the whole method.
#   NOTE(review): sibling methods re-raise ArgumentError unwrapped; the
#   wrapping is kept here so callers see the same exception type as before.
def hybrid_search(text_query, vector_query, options = {})
  raise ArgumentError, 'Text query or vector query must be provided' if text_query.nil? && vector_query.nil?

  limit = options[:limit] || 20
  k = options[:rrf_k] || 60 # RRF fusion parameter
  # Over-fetch from each source so fusion has enough candidates to rank.
  candidate_limit = limit * 2

  text_results = text_query ? search_by_text(text_query, nil, candidate_limit) : []

  # Vector search is delegated to the caller; guard against a nil return so
  # the fusion step always receives an array.
  vector_results = block_given? ? (yield(vector_query, candidate_limit) || []) : []

  combine_results_with_rrf(text_results, vector_results, k: k).first(limit)
rescue StandardError => e
  @logger.error "Hybrid search failed: #{e.message}"
  @logger.error e.backtrace.join("\n")
  raise Errors::HybridSearchError, "Hybrid search failed: #{e.message}"
end
232
+
233
# Identify the language of a piece of text via the configured query parser.
# @param text [String] Text to analyze
# @return [String] Language code as reported by the parser
def detect_language(text)
  query_parser.detect_language(text)
end
239
+
240
# Translate search text into a tsquery string via the query parser, logging
# the inputs at debug level first.
# @param text [String] Search text
# @param language [String] Language code
# @return [String] tsquery string produced by the parser
def build_tsquery(text, language)
  trace = "FulltextManager.build_tsquery called with text='#{text}', language='#{language}' (class: #{language.class})"
  @logger.debug(trace)

  query_parser.build_tsquery(text, language)
end
248
+
249
# Hand an advanced query string to the query parser and return its result.
# @param text [String] Query text
# @return [Hash] Parsed query structure as produced by the parser
def parse_advanced_query(text)
  query_parser.parse_advanced_query(text)
end
255
+
256
# Summarise the contents of the section_fts table.
# @return [Hash] :total_indexed (row count), :languages (distinct language
#   values) and :last_updated (maximum updated_at); {} if any query fails
def stats
  fts = db[:section_fts]

  total = fts.count
  language_codes = fts.select(:language).distinct.map(:language)
  # Single-row aggregate; the only column in that row is the max timestamp.
  newest = fts.select { max(:updated_at) }.first.values.first

  { total_indexed: total, languages: language_codes, last_updated: newest }
rescue StandardError => e
  @logger.error "Failed to get stats: #{e.message}"
  {}
end
268
+
269
# Delete the full-text index row belonging to a section.
# @param section_id [Integer] Section ID
# @return [Boolean] true if a row was deleted; false if none existed or the
#   delete failed (the failure is logged, not raised)
# @raise [ArgumentError] if section_id is nil
def remove_index(section_id)
  raise ArgumentError, 'Section ID cannot be nil' unless section_id

  removed = db[:section_fts].where(section_id: section_id).delete

  unless removed.positive?
    @logger.warn "No full-text index found for section #{section_id}"
    return false
  end

  @logger.info "Removed full-text index for section #{section_id}"
  true
rescue ArgumentError
  # Argument validation failures are programming errors; let them surface.
  raise
rescue StandardError => e
  # Database errors and any other failure are handled identically: log, report false.
  @logger.error "Failed to remove index for section #{section_id}: #{e.message}"
  false
end
294
+
295
# Delete section_fts rows whose section no longer exists.
# @return [Integer] Number of rows deleted; 0 if the cleanup fails (logged)
def cleanup_orphaned_indexes
  existing_section_ids = db[:source_sections].select(:id)

  orphans = db[:section_fts]
            .where(Sequel[:section_fts][:section_id] => existing_section_ids)
            .invert # negate the IN condition, i.e. section_id NOT IN (existing ids)
            .or(section_id: nil) # rows with a NULL section_id are removed too

  removed = orphans.delete
  @logger.info "Cleaned up #{removed} orphaned full-text indexes"
  removed
rescue StandardError => e
  @logger.error "Failed to cleanup orphaned indexes: #{e.message}"
  0
end
313
+
314
+ private
315
+
316
# Language detection for a search query; delegates to the query parser.
def detect_language_from_query(query)
  query_parser.detect_language(query)
end
320
+
321
# Narrow a search dataset by the optional filters.
#
# Supported keys: :document_ids, :tag_ids, :date_from, :date_to. Keys that
# are absent (or nil) leave the dataset untouched.
#
# The tag filter uses an IN (subselect) on section_tags rather than a JOIN:
# a join returns one row per matching tag, so a section carrying several of
# the requested tags would appear multiple times and use up the LIMIT.
#
# @param dataset [Sequel::Dataset] Dataset built by the search method
# @param filters [Hash, nil] Filter options
# @return [Sequel::Dataset] The filtered dataset (the same object when there
#   is nothing to filter on)
def apply_search_filters(dataset, filters)
  return dataset if !filters || filters.empty?

  if filters[:document_ids]
    dataset = dataset.where(Sequel[:source_sections][:document_id] => filters[:document_ids])
  end

  if filters[:tag_ids]
    tagged_section_ids = db[:section_tags].where(tag_id: filters[:tag_ids]).select(:section_id)
    dataset = dataset.where(Sequel[:section_fts][:section_id] => tagged_section_ids)
  end

  # Date range bounds are inclusive on both ends.
  dataset = dataset.where(Sequel[:source_sections][:created_at] >= filters[:date_from]) if filters[:date_from]
  dataset = dataset.where(Sequel[:source_sections][:created_at] <= filters[:date_to]) if filters[:date_to]

  dataset
end
347
+
348
# Resolve the PostgreSQL text search configuration name for a language.
#
# Looks the language up in Models::TextSearchConfig and falls back to
# 'pg_catalog.simple' when there is no entry, when the lookup fails, or when
# the entry is 'jieba' but a probe query against that configuration fails.
#
# The probe runs inside transaction(savepoint: true). A failing statement
# aborts the surrounding PostgreSQL transaction, so probing without a
# savepoint would poison any transaction the caller has open.
#
# @param language [String, Symbol] Language code
# @return [String] Text search configuration name
def get_text_search_config(language)
  config = Models::TextSearchConfig.first(language_code: language.to_s)&.config_name
  return 'pg_catalog.simple' unless config
  return config unless config == 'jieba'

  begin
    db.transaction(savepoint: true) do
      db.fetch("SELECT to_tsvector('jieba', 'test')").first
    end
    'jieba'
  rescue StandardError => e
    @logger.warn "pg_jieba extension not available, falling back to simple: #{e.message}"
    'pg_catalog.simple'
  end
rescue StandardError => e
  @logger.warn "Failed to get text search config for #{language}: #{e.message}, using simple"
  'pg_catalog.simple'
end
370
+
371
# Wrap a tsvector SQL expression in a setweight(...) call.
# Returns '' when the expression is nil or blank.
def setweight(vector, weight)
  blank = vector.nil? || vector.to_s.strip.empty?
  blank ? '' : "setweight(#{vector}, '#{weight}')"
end
377
+
378
# Build the SQL text of a to_tsvector(config, text) call; the text is turned
# into a quoted SQL string literal by escape_quote.
def to_tsvector(config, text)
  quoted_text = escape_quote(text)
  "to_tsvector('#{config}', #{quoted_text})"
end
382
+
383
# Work out which text search configurations to index a text with.
#
# The primary language's configuration always comes first. Each of zh/ja/ko
# that is not the primary language and whose ratio from language_mix_ratios
# is at least 0.2 contributes its configuration as well. Duplicates are
# removed.
#
# @param primary_language [String, Symbol] Declared language of the text
# @param text [String] Text used to measure the language mix
# @return [Array<String>] Unique configuration names, primary first
def build_language_configs(primary_language, text)
  primary = primary_language.to_s
  resolved = [get_text_search_config(primary)]

  ratios = language_mix_ratios(text)
  return resolved if ratios[:total].zero?

  %i[zh ja ko].each do |code|
    next if primary == code.to_s || ratios[code] < 0.2

    resolved << get_text_search_config(code.to_s)
  end

  resolved.uniq
end
401
+
402
# Build the SQL text for a weighted tsvector of `text`: one
# setweight(to_tsvector(...)) term per configuration, concatenated with the
# SQL || operator when there is more than one.
#
# @param configs [Array<String>] Text search configuration names
# @param text [String] Text to index
# @param weight [String] Weight label passed to setweight
# @return [String] SQL expression text ('' when configs is empty)
def build_weighted_vector(configs, text, weight)
  configs
    .map { |config_name| setweight(to_tsvector(config_name, text), weight) }
    .join(' || ')
end
408
+
409
# Measure how the CJK characters of a text split between three scripts.
#
# Characters are counted per Unicode range: U+4E00-U+9FFF (:zh),
# U+3040-U+309F plus U+30A0-U+30FF (:ja) and U+AC00-U+D7AF (:ko). Each ratio
# is that count divided by the sum of the three counts, not by the length of
# the text.
#
# @param text [String, nil] Text to inspect
# @return [Hash] { zh: Float, ja: Float, ko: Float, total: Integer }; all
#   zeros when the text is nil, empty or has no characters in those ranges
def language_mix_ratios(text)
  nothing = { zh: 0.0, ja: 0.0, ko: 0.0, total: 0 }
  return nothing if text.nil? || text.empty?

  counts = {
    zh: text.scan(/[\u4e00-\u9fff]/).length,
    ja: text.scan(/[\u3040-\u309f\u30a0-\u30ff]/).length,
    ko: text.scan(/[\uac00-\ud7af]/).length
  }
  total = counts.values.sum
  return nothing if total.zero?

  counts.transform_values { |count| count.to_f / total }.merge(total: total)
end
425
+
426
# Turn text into a single-quoted SQL string literal by doubling every
# embedded single quote.
def escape_quote(text)
  doubled = text.gsub("'", "''")
  "'" + doubled + "'"
end
430
+
431
# Fuse two ranked result lists with Reciprocal Rank Fusion.
#
# Each list contributes 1.0 / (k + rank) per result (rank starts at 1) and
# results are merged by :section_id. When a section appears in both lists the
# text result's hash is the one returned. Results with equal fused scores
# keep first-seen order (text list first, then vector list); sort_by alone
# is not stable, so the position is part of the sort key.
#
# @param text_results [Array<Hash>] Ranked full-text results with :section_id
# @param vector_results [Array<Hash>] Ranked vector results with :section_id
# @param k [Numeric] RRF smoothing constant
# @return [Array<Hash>] Result hashes ordered by descending fused score
def combine_results_with_rrf(text_results, vector_results, k:)
  fused = {}

  text_results.each_with_index do |result, index|
    fused[result[:section_id]] = {
      text_score: 1.0 / (k + index + 1),
      vector_score: 0,
      data: result
    }
  end

  vector_results.each_with_index do |result, index|
    contribution = 1.0 / (k + index + 1)
    entry = fused[result[:section_id]]

    if entry
      entry[:vector_score] = contribution
    else
      fused[result[:section_id]] = { text_score: 0, vector_score: contribution, data: result }
    end
  end

  fused.values
       .each_with_index
       .sort_by { |entry, position| [-(entry[:text_score] + entry[:vector_score]), position] }
       .map { |entry, _position| entry[:data] }
end
467
+
468
# Shape a raw result row into the hash returned to callers.
# :rank_score falls back to 0 and :highlight to '' when the row has no value
# for them; the original query text is attached under :query.
def format_search_result(row, query)
  copied_keys = %i[section_id section_title content document_id language]
  formatted = copied_keys.map { |key| [key, row[key]] }.to_h

  formatted[:rank_score] = row[:rank_score] || 0
  formatted[:highlight] = row[:highlight] || ''
  formatted[:query] = query
  formatted
end
481
+ end
482
+ end
483
+ end
@@ -0,0 +1,85 @@
1
+ require 'json'
2
+ require 'tempfile'
3
+ require 'open3'
4
+
5
+ module SmartRAG
6
+ module Core
7
# Bridge to the Python markitdown library: runs a short Python script in a
# subprocess and returns the markdown it prints.
class MarkitdownBridge
  class ConversionError < StandardError; end
  class UnsupportedFormatError < StandardError; end

  # Python driver executed with `-c`. The file path is read from sys.argv[1]
  # instead of being spliced into the source, so paths containing quotes or
  # backslashes cannot break, or inject code into, the script.
  CONVERT_SCRIPT = <<~'PYTHON'.freeze
    import sys
    from markitdown import MarkItDown

    try:
        result = MarkItDown().convert(sys.argv[1])
        # Return just the text content
        print(result.text_content)
    except Exception as e:
        print(f"ERROR: {str(e)}", file=sys.stderr)
        sys.exit(1)
  PYTHON
  private_constant :CONVERT_SCRIPT

  # Probes for a Python interpreter that can import markitdown.
  def initialize
    @python_cmd = detect_python_cmd
    @python_available = !@python_cmd.nil?
  end

  # Convert a file to markdown
  # @param [String] file_path Path to the file to convert
  # @return [String] Converted markdown content
  # @raise [ConversionError] if markitdown is unavailable, the file does not
  #   exist, the subprocess fails or the result is blank. Every failure is
  #   re-wrapped with a "Failed to convert <path>: ..." prefix.
  def convert(file_path)
    raise ConversionError, "Markitdown is not available" unless @python_available
    raise ConversionError, "File not found: #{file_path}" unless File.exist?(file_path)

    result = call_python_convert(file_path)

    # Python's print always appends a newline, so test for blank, not empty.
    if result.nil? || result.strip.empty?
      raise ConversionError, "Conversion returned empty result"
    end

    result
  rescue StandardError => e
    raise ConversionError, "Failed to convert #{file_path}: #{e.message}"
  end

  # Check if markitdown is available
  # @return [Boolean]
  def available?
    @python_available
  end

  private

  # @return [String, nil] first of python3/python that can import markitdown
  def detect_python_cmd
    %w[python3 python].find { |cmd| markitdown_importable?(cmd) }
  end

  # Re-runs the import probe against the interpreter chosen at construction.
  def check_python_markitdown
    return false if @python_cmd.nil?

    markitdown_importable?(@python_cmd)
  end

  # Runs `<cmd> -c "import markitdown"` with output discarded and returns
  # system's result (nil when the command cannot be executed).
  def markitdown_importable?(cmd)
    system(cmd, "-c", "import markitdown", out: File::NULL, err: File::NULL)
  end

  # Run the conversion script and return what it wrote to stdout.
  # stderr is captured separately so that anything Python writes there does
  # not end up inside the returned markdown; it is used as the error message
  # when the process exits non-zero.
  def call_python_convert(file_path)
    stdout, stderr, status = Open3.capture3(@python_cmd, "-c", CONVERT_SCRIPT, file_path.to_s)
    return stdout if status.success?

    raise ConversionError, (stderr.strip.empty? ? stdout : stderr)
  end
end
84
+ end
85
+ end