rubino-agent 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +115 -0
- data/.rubocop_todo.yml +955 -0
- data/.ruby-version +1 -0
- data/AGENTS.md +97 -0
- data/CHANGELOG.md +344 -0
- data/CONTRIBUTING.md +69 -0
- data/LICENSE +21 -0
- data/README.md +200 -0
- data/Rakefile +8 -0
- data/docs/agents.md +190 -0
- data/docs/api/v1.md +414 -0
- data/docs/architecture.md +177 -0
- data/docs/commands.md +375 -0
- data/docs/configuration.md +590 -0
- data/docs/getting-started.md +143 -0
- data/docs/jobs.md +332 -0
- data/docs/mcp.md +128 -0
- data/docs/memory.md +98 -0
- data/docs/models-and-keys.md +173 -0
- data/docs/oauth-providers.md +145 -0
- data/docs/plugins.md +195 -0
- data/docs/security.md +145 -0
- data/docs/skills.md +322 -0
- data/docs/tools.md +395 -0
- data/docs/troubleshooting.md +73 -0
- data/exe/rubino +9 -0
- data/install.sh +275 -0
- data/lib/rubino/active_skill.rb +50 -0
- data/lib/rubino/agent/agent_registry.rb +120 -0
- data/lib/rubino/agent/backoff_policy.rb +116 -0
- data/lib/rubino/agent/definition.rb +128 -0
- data/lib/rubino/agent/degenerate_recovery.rb +271 -0
- data/lib/rubino/agent/fallback_chain.rb +194 -0
- data/lib/rubino/agent/iteration_budget.rb +50 -0
- data/lib/rubino/agent/loop.rb +617 -0
- data/lib/rubino/agent/model_call_runner.rb +383 -0
- data/lib/rubino/agent/prompts/build.txt +69 -0
- data/lib/rubino/agent/prompts/compaction.txt +20 -0
- data/lib/rubino/agent/prompts/explore.txt +19 -0
- data/lib/rubino/agent/prompts/general.txt +20 -0
- data/lib/rubino/agent/prompts/plan.txt +31 -0
- data/lib/rubino/agent/response_validator.rb +70 -0
- data/lib/rubino/agent/router.rb +65 -0
- data/lib/rubino/agent/runner.rb +195 -0
- data/lib/rubino/agent/tool_executor.rb +402 -0
- data/lib/rubino/agent/truncation_continuation.rb +137 -0
- data/lib/rubino/api/middleware/auth.rb +43 -0
- data/lib/rubino/api/middleware/error_handler.rb +65 -0
- data/lib/rubino/api/middleware/json_parser.rb +100 -0
- data/lib/rubino/api/middleware/observability.rb +59 -0
- data/lib/rubino/api/middleware/rate_limit.rb +136 -0
- data/lib/rubino/api/operations/approvals/decide_operation.rb +49 -0
- data/lib/rubino/api/operations/clarifications/decide_operation.rb +44 -0
- data/lib/rubino/api/operations/cron_jobs/create_operation.rb +46 -0
- data/lib/rubino/api/operations/cron_jobs/delete_operation.rb +36 -0
- data/lib/rubino/api/operations/cron_jobs/list_operation.rb +55 -0
- data/lib/rubino/api/operations/cron_jobs/pause_operation.rb +34 -0
- data/lib/rubino/api/operations/cron_jobs/resume_operation.rb +34 -0
- data/lib/rubino/api/operations/cron_jobs/schedule_validation.rb +30 -0
- data/lib/rubino/api/operations/cron_jobs/show_operation.rb +32 -0
- data/lib/rubino/api/operations/cron_jobs/trigger_operation.rb +38 -0
- data/lib/rubino/api/operations/cron_jobs/update_operation.rb +42 -0
- data/lib/rubino/api/operations/files/read_operation.rb +40 -0
- data/lib/rubino/api/operations/files/upload_operation.rb +175 -0
- data/lib/rubino/api/operations/health_operation.rb +46 -0
- data/lib/rubino/api/operations/memory/delete_operation.rb +32 -0
- data/lib/rubino/api/operations/memory/index_operation.rb +80 -0
- data/lib/rubino/api/operations/memory/stats_operation.rb +28 -0
- data/lib/rubino/api/operations/metrics_operation.rb +18 -0
- data/lib/rubino/api/operations/mode/show_operation.rb +29 -0
- data/lib/rubino/api/operations/mode/update_operation.rb +42 -0
- data/lib/rubino/api/operations/models/list_operation.rb +45 -0
- data/lib/rubino/api/operations/oauth/connections/disconnect_operation.rb +77 -0
- data/lib/rubino/api/operations/oauth/connections/list_operation.rb +36 -0
- data/lib/rubino/api/operations/oauth/providers/callback_operation.rb +82 -0
- data/lib/rubino/api/operations/oauth/providers/connect_operation.rb +44 -0
- data/lib/rubino/api/operations/oauth/providers/list_operation.rb +35 -0
- data/lib/rubino/api/operations/oauth/serializer.rb +21 -0
- data/lib/rubino/api/operations/runs/create_operation.rb +77 -0
- data/lib/rubino/api/operations/runs/events_operation.rb +195 -0
- data/lib/rubino/api/operations/runs/stop_operation.rb +34 -0
- data/lib/rubino/api/operations/sessions/create_operation.rb +46 -0
- data/lib/rubino/api/operations/sessions/delete_operation.rb +33 -0
- data/lib/rubino/api/operations/sessions/index_operation.rb +82 -0
- data/lib/rubino/api/operations/sessions/retry_operation.rb +45 -0
- data/lib/rubino/api/operations/sessions/show_operation.rb +59 -0
- data/lib/rubino/api/operations/sessions/undo_operation.rb +38 -0
- data/lib/rubino/api/operations/skills/list_operation.rb +34 -0
- data/lib/rubino/api/operations/skills/toggle_operation.rb +40 -0
- data/lib/rubino/api/operations/tasks/index_operation.rb +30 -0
- data/lib/rubino/api/operations/tasks/serializer.rb +60 -0
- data/lib/rubino/api/operations/tasks/show_operation.rb +33 -0
- data/lib/rubino/api/operations/tasks/stop_operation.rb +47 -0
- data/lib/rubino/api/request.rb +54 -0
- data/lib/rubino/api/responses.rb +64 -0
- data/lib/rubino/api/router.rb +72 -0
- data/lib/rubino/api/schemas.rb +103 -0
- data/lib/rubino/api/server.rb +102 -0
- data/lib/rubino/api/tls.rb +108 -0
- data/lib/rubino/attachments/classification.rb +16 -0
- data/lib/rubino/attachments/classify.rb +171 -0
- data/lib/rubino/attachments/defang.rb +47 -0
- data/lib/rubino/attachments/policy.rb +36 -0
- data/lib/rubino/attachments/preamble.rb +120 -0
- data/lib/rubino/boot/encryption_key.rb +32 -0
- data/lib/rubino/cli/chat/bang_shell.rb +257 -0
- data/lib/rubino/cli/chat/completion_builder.rb +290 -0
- data/lib/rubino/cli/chat/idle_card_host.rb +69 -0
- data/lib/rubino/cli/chat/image_inbox.rb +168 -0
- data/lib/rubino/cli/chat/session_resolver.rb +176 -0
- data/lib/rubino/cli/chat_command.rb +1674 -0
- data/lib/rubino/cli/commands.rb +250 -0
- data/lib/rubino/cli/config_command.rb +96 -0
- data/lib/rubino/cli/doctor_command.rb +251 -0
- data/lib/rubino/cli/jobs_command.rb +60 -0
- data/lib/rubino/cli/memory_command.rb +135 -0
- data/lib/rubino/cli/onboarding_wizard.rb +207 -0
- data/lib/rubino/cli/server_command.rb +139 -0
- data/lib/rubino/cli/session_command.rb +125 -0
- data/lib/rubino/cli/setup_command.rb +107 -0
- data/lib/rubino/cli/skills_command.rb +85 -0
- data/lib/rubino/cli/tools_command.rb +81 -0
- data/lib/rubino/cli/trust_gate.rb +71 -0
- data/lib/rubino/commands/built_ins.rb +46 -0
- data/lib/rubino/commands/command.rb +116 -0
- data/lib/rubino/commands/executor.rb +550 -0
- data/lib/rubino/commands/handlers/agents.rb +510 -0
- data/lib/rubino/commands/handlers/config.rb +88 -0
- data/lib/rubino/commands/handlers/help.rb +148 -0
- data/lib/rubino/commands/handlers/jobs.rb +71 -0
- data/lib/rubino/commands/handlers/mcp.rb +229 -0
- data/lib/rubino/commands/handlers/memory.rb +200 -0
- data/lib/rubino/commands/handlers/sessions.rb +207 -0
- data/lib/rubino/commands/handlers/skills.rb +195 -0
- data/lib/rubino/commands/handlers/status.rb +211 -0
- data/lib/rubino/commands/loader.rb +90 -0
- data/lib/rubino/config/configuration.rb +455 -0
- data/lib/rubino/config/defaults.rb +569 -0
- data/lib/rubino/config/loader.rb +115 -0
- data/lib/rubino/config/reasoning_prefs.rb +67 -0
- data/lib/rubino/config/writer.rb +72 -0
- data/lib/rubino/context/compressor.rb +149 -0
- data/lib/rubino/context/environment_inspector.rb +176 -0
- data/lib/rubino/context/file_discovery.rb +45 -0
- data/lib/rubino/context/message_boundary.rb +39 -0
- data/lib/rubino/context/prompt_assembler.rb +382 -0
- data/lib/rubino/context/summary_builder.rb +159 -0
- data/lib/rubino/context/token_budget.rb +68 -0
- data/lib/rubino/context/tool_pair_sanitizer.rb +70 -0
- data/lib/rubino/database/connection.rb +77 -0
- data/lib/rubino/database/migrations/001_create_initial_schema.rb +156 -0
- data/lib/rubino/database/migrations/002_create_runs.rb +45 -0
- data/lib/rubino/database/migrations/003_create_skill_states.rb +15 -0
- data/lib/rubino/database/migrations/004_create_cron_jobs.rb +36 -0
- data/lib/rubino/database/migrations/005_create_oauth_connections.rb +27 -0
- data/lib/rubino/database/migrations/006_create_webhook_deliveries.rb +34 -0
- data/lib/rubino/database/migrations/007_create_messages_fts.rb +59 -0
- data/lib/rubino/database/migrations/008_create_memory_facts.rb +75 -0
- data/lib/rubino/database/migrations/009_create_memory_graph.rb +55 -0
- data/lib/rubino/database/migrations/010_add_owner_pid_to_sessions.rb +20 -0
- data/lib/rubino/database/migrator.rb +48 -0
- data/lib/rubino/documents/converters/csv.rb +79 -0
- data/lib/rubino/documents/converters/docx.rb +129 -0
- data/lib/rubino/documents/converters/html.rb +28 -0
- data/lib/rubino/documents/converters/json.rb +35 -0
- data/lib/rubino/documents/converters/pdf.rb +59 -0
- data/lib/rubino/documents/converters/plain.rb +68 -0
- data/lib/rubino/documents/converters/pptx.rb +64 -0
- data/lib/rubino/documents/converters/xlsx.rb +62 -0
- data/lib/rubino/documents/converters/xml.rb +45 -0
- data/lib/rubino/documents/html.rb +71 -0
- data/lib/rubino/documents/registry.rb +68 -0
- data/lib/rubino/documents/table.rb +63 -0
- data/lib/rubino/documents.rb +50 -0
- data/lib/rubino/errors.rb +119 -0
- data/lib/rubino/files/workspace.rb +93 -0
- data/lib/rubino/interaction/cancel_token.rb +43 -0
- data/lib/rubino/interaction/clipboard_image.rb +84 -0
- data/lib/rubino/interaction/event_bus.rb +48 -0
- data/lib/rubino/interaction/events.rb +101 -0
- data/lib/rubino/interaction/image_input.rb +127 -0
- data/lib/rubino/interaction/input_queue.rb +117 -0
- data/lib/rubino/interaction/lifecycle.rb +299 -0
- data/lib/rubino/interaction/probe.rb +65 -0
- data/lib/rubino/interaction/state.rb +56 -0
- data/lib/rubino/jobs/cron_job_repository.rb +75 -0
- data/lib/rubino/jobs/handlers/cleanup_sessions_job.rb +32 -0
- data/lib/rubino/jobs/handlers/compact_session_job.rb +21 -0
- data/lib/rubino/jobs/handlers/distill_skill_job.rb +186 -0
- data/lib/rubino/jobs/handlers/extract_memory_job.rb +37 -0
- data/lib/rubino/jobs/handlers/summarize_session_job.rb +21 -0
- data/lib/rubino/jobs/queue.rb +184 -0
- data/lib/rubino/jobs/registry.rb +45 -0
- data/lib/rubino/jobs/runner.rb +79 -0
- data/lib/rubino/jobs/scheduler.rb +138 -0
- data/lib/rubino/jobs/webhook_delivery.rb +225 -0
- data/lib/rubino/jobs/worker.rb +59 -0
- data/lib/rubino/llm/adapter_factory.rb +47 -0
- data/lib/rubino/llm/adapter_response.rb +65 -0
- data/lib/rubino/llm/auxiliary_client.rb +61 -0
- data/lib/rubino/llm/bedrock_bearer_client.rb +235 -0
- data/lib/rubino/llm/content_builder.rb +55 -0
- data/lib/rubino/llm/credential_check.rb +93 -0
- data/lib/rubino/llm/error_classifier.rb +364 -0
- data/lib/rubino/llm/fake_provider.rb +292 -0
- data/lib/rubino/llm/inline_think_filter.rb +58 -0
- data/lib/rubino/llm/model_catalog.rb +29 -0
- data/lib/rubino/llm/provider_resolver.rb +48 -0
- data/lib/rubino/llm/reasoning_manager.rb +100 -0
- data/lib/rubino/llm/request.rb +56 -0
- data/lib/rubino/llm/ruby_llm_adapter.rb +794 -0
- data/lib/rubino/llm/scenario_loader.rb +68 -0
- data/lib/rubino/llm/scenario_selector.rb +80 -0
- data/lib/rubino/llm/scenarios/agent-creates-cron-failure.yml +29 -0
- data/lib/rubino/llm/scenarios/agent-creates-cron.yml +36 -0
- data/lib/rubino/llm/scenarios/analysis.yml +501 -0
- data/lib/rubino/llm/scenarios/complex-analysis.yml +598 -0
- data/lib/rubino/llm/scenarios/failure.yml +65 -0
- data/lib/rubino/llm/scenarios/happy-path.yml +24 -0
- data/lib/rubino/llm/scenarios/provider-quota-completed.yml +14 -0
- data/lib/rubino/llm/scenarios/wide-table.yml +121 -0
- data/lib/rubino/llm/scenarios/with-approvals.yml +50 -0
- data/lib/rubino/llm/scenarios/with-artifacts.yml +98 -0
- data/lib/rubino/llm/scenarios/with-clarify.yml +32 -0
- data/lib/rubino/llm/scenarios/with-reasoning.yml +175 -0
- data/lib/rubino/llm/scenarios/with-uploads.yml +104 -0
- data/lib/rubino/llm/thinking_support.rb +84 -0
- data/lib/rubino/llm/tool_bridge.rb +89 -0
- data/lib/rubino/logger.rb +99 -0
- data/lib/rubino/mcp/manager.rb +180 -0
- data/lib/rubino/mcp/mcp_tool_wrapper.rb +69 -0
- data/lib/rubino/mcp.rb +57 -0
- data/lib/rubino/memory/backend.rb +104 -0
- data/lib/rubino/memory/backends/default.rb +101 -0
- data/lib/rubino/memory/backends/sqlite.rb +653 -0
- data/lib/rubino/memory/backends.rb +53 -0
- data/lib/rubino/memory/deduplicator.rb +74 -0
- data/lib/rubino/memory/extractor.rb +85 -0
- data/lib/rubino/memory/flusher.rb +31 -0
- data/lib/rubino/memory/retriever.rb +50 -0
- data/lib/rubino/memory/sqlite_extraction_prompt.rb +70 -0
- data/lib/rubino/memory/sqlite_graph.rb +154 -0
- data/lib/rubino/memory/store.rb +228 -0
- data/lib/rubino/memory/threat_scanner.rb +68 -0
- data/lib/rubino/metrics.rb +175 -0
- data/lib/rubino/modes.rb +93 -0
- data/lib/rubino/oauth/connection_repository.rb +95 -0
- data/lib/rubino/oauth/provider/github.rb +75 -0
- data/lib/rubino/oauth/provider/google.rb +59 -0
- data/lib/rubino/oauth/provider.rb +149 -0
- data/lib/rubino/oauth/registry.rb +86 -0
- data/lib/rubino/oauth/token_encryptor.rb +87 -0
- data/lib/rubino/plugins/registry.rb +75 -0
- data/lib/rubino/plugins.rb +86 -0
- data/lib/rubino/run/approval_gate.rb +243 -0
- data/lib/rubino/run/attachment_downloader.rb +166 -0
- data/lib/rubino/run/event_store.rb +74 -0
- data/lib/rubino/run/executor.rb +383 -0
- data/lib/rubino/run/gate_registry.rb +39 -0
- data/lib/rubino/run/recorder.rb +69 -0
- data/lib/rubino/run/repository.rb +118 -0
- data/lib/rubino/run/session_approval_cache.rb +118 -0
- data/lib/rubino/security/allowlist_persister.rb +55 -0
- data/lib/rubino/security/approval_policy.rb +227 -0
- data/lib/rubino/security/command_allowlist.rb +24 -0
- data/lib/rubino/security/dangerous_patterns.rb +118 -0
- data/lib/rubino/security/deny_persister.rb +73 -0
- data/lib/rubino/security/doom_loop_detector.rb +43 -0
- data/lib/rubino/security/hardline_guard.rb +105 -0
- data/lib/rubino/security/pattern_matcher.rb +62 -0
- data/lib/rubino/security/prefix_deriver.rb +124 -0
- data/lib/rubino/security/readonly_commands.rb +211 -0
- data/lib/rubino/session/exporter.rb +101 -0
- data/lib/rubino/session/message.rb +77 -0
- data/lib/rubino/session/repository.rb +295 -0
- data/lib/rubino/session/store.rb +198 -0
- data/lib/rubino/session/summary_store.rb +65 -0
- data/lib/rubino/skills/prompt_index.rb +85 -0
- data/lib/rubino/skills/registry.rb +208 -0
- data/lib/rubino/skills/skill.rb +176 -0
- data/lib/rubino/skills/skill_tool.rb +215 -0
- data/lib/rubino/skills/state_repository.rb +37 -0
- data/lib/rubino/skills/toggle.rb +26 -0
- data/lib/rubino/tools/answer_child_tool.rb +83 -0
- data/lib/rubino/tools/ask_parent_tool.rb +232 -0
- data/lib/rubino/tools/attach_file_tool.rb +120 -0
- data/lib/rubino/tools/background_tasks.rb +520 -0
- data/lib/rubino/tools/base.rb +222 -0
- data/lib/rubino/tools/custom_tool_loader.rb +119 -0
- data/lib/rubino/tools/edit_tool.rb +122 -0
- data/lib/rubino/tools/git_tool.rb +71 -0
- data/lib/rubino/tools/github_tool.rb +233 -0
- data/lib/rubino/tools/glob_tool.rb +69 -0
- data/lib/rubino/tools/grep_tool.rb +206 -0
- data/lib/rubino/tools/memory_tool.rb +184 -0
- data/lib/rubino/tools/multi_edit_tool.rb +110 -0
- data/lib/rubino/tools/patch_tool.rb +260 -0
- data/lib/rubino/tools/probe_tool.rb +175 -0
- data/lib/rubino/tools/question_tool.rb +128 -0
- data/lib/rubino/tools/read_attachment_tool.rb +180 -0
- data/lib/rubino/tools/read_tool.rb +212 -0
- data/lib/rubino/tools/read_tracker.rb +98 -0
- data/lib/rubino/tools/registry.rb +166 -0
- data/lib/rubino/tools/result.rb +113 -0
- data/lib/rubino/tools/ruby_tool.rb +0 -0
- data/lib/rubino/tools/session_search_tool.rb +103 -0
- data/lib/rubino/tools/shell_input_tool.rb +96 -0
- data/lib/rubino/tools/shell_kill_tool.rb +76 -0
- data/lib/rubino/tools/shell_output_tool.rb +72 -0
- data/lib/rubino/tools/shell_registry.rb +158 -0
- data/lib/rubino/tools/shell_tail_tool.rb +118 -0
- data/lib/rubino/tools/shell_tool.rb +330 -0
- data/lib/rubino/tools/steer_tool.rb +118 -0
- data/lib/rubino/tools/subagent_probe.rb +89 -0
- data/lib/rubino/tools/summarize_file_tool.rb +182 -0
- data/lib/rubino/tools/task_result_tool.rb +90 -0
- data/lib/rubino/tools/task_stop_tool.rb +80 -0
- data/lib/rubino/tools/task_tool.rb +622 -0
- data/lib/rubino/tools/test_tool.rb +454 -0
- data/lib/rubino/tools/todo_tool.rb +93 -0
- data/lib/rubino/tools/tool_call_repository.rb +33 -0
- data/lib/rubino/tools/vision_tool.rb +85 -0
- data/lib/rubino/tools/webfetch_tool.rb +153 -0
- data/lib/rubino/tools/websearch_tool.rb +179 -0
- data/lib/rubino/tools/write_tool.rb +61 -0
- data/lib/rubino/trust.rb +88 -0
- data/lib/rubino/ui/api.rb +296 -0
- data/lib/rubino/ui/base.rb +252 -0
- data/lib/rubino/ui/bottom_composer.rb +1599 -0
- data/lib/rubino/ui/cli.rb +1987 -0
- data/lib/rubino/ui/completion_menu.rb +321 -0
- data/lib/rubino/ui/completion_source.rb +284 -0
- data/lib/rubino/ui/escape_reader.rb +169 -0
- data/lib/rubino/ui/indented_io.rb +88 -0
- data/lib/rubino/ui/input_history.rb +108 -0
- data/lib/rubino/ui/live_region.rb +183 -0
- data/lib/rubino/ui/markdown_renderer.rb +506 -0
- data/lib/rubino/ui/notifier.rb +163 -0
- data/lib/rubino/ui/null.rb +195 -0
- data/lib/rubino/ui/paste_store.rb +176 -0
- data/lib/rubino/ui/printer_base.rb +79 -0
- data/lib/rubino/ui/probe_wait_indicator.rb +75 -0
- data/lib/rubino/ui/queued_indicators.rb +66 -0
- data/lib/rubino/ui/status_bar.rb +100 -0
- data/lib/rubino/ui/stdout_proxy.rb +161 -0
- data/lib/rubino/ui/streaming_markdown.rb +186 -0
- data/lib/rubino/ui/subagent_cards.rb +134 -0
- data/lib/rubino/ui/subagent_view.rb +255 -0
- data/lib/rubino/ui.rb +21 -0
- data/lib/rubino/update_check.rb +187 -0
- data/lib/rubino/util/duration.rb +23 -0
- data/lib/rubino/util/hyperlink.rb +105 -0
- data/lib/rubino/util/output.rb +145 -0
- data/lib/rubino/util/secrets_mask.rb +83 -0
- data/lib/rubino/version.rb +5 -0
- data/lib/rubino/workspace.rb +85 -0
- data/lib/rubino-agent.rb +5 -0
- data/lib/rubino.rb +318 -0
- data/mise.toml +2 -0
- data/rubino-agent.gemspec +103 -0
- data/skills/ruby-expert/SKILL.md +67 -0
- data/skills/ruby-expert/references/concurrency.md +357 -0
- data/skills/ruby-expert/references/datetime-and-encoding.md +363 -0
- data/skills/ruby-expert/references/errors-and-types.md +460 -0
- data/skills/ruby-expert/references/gem-authoring.md +459 -0
- data/skills/ruby-expert/references/language-idioms.md +465 -0
- data/skills/ruby-expert/references/metaprogramming.md +339 -0
- data/skills/ruby-expert/references/oo-design.md +553 -0
- data/skills/ruby-expert/references/performance.md +383 -0
- data/skills/ruby-expert/references/rails.md +424 -0
- data/skills/ruby-expert/references/security.md +404 -0
- data/skills/ruby-expert/references/testing.md +473 -0
- data/skills/ruby-expert/references/tooling.md +466 -0
- metadata +856 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# CSV -> a GFM Markdown table (first row = header), via the shared Table
|
|
7
|
+
# emitter. `csv` was removed from Ruby's default gems in 3.4, so we require
|
|
8
|
+
# it defensively: if it isn't present we still parse with a tiny built-in
|
|
9
|
+
# splitter (handles the common quoted-field case) rather than going
|
|
10
|
+
# unavailable -- CSV is too central to drop on a missing stdlib gem.
|
|
11
|
+
class Csv
  # Always true: when the `csv` library cannot be loaded, parsing falls back
  # to the built-in splitter below, so this converter never goes unavailable.
  def available?
    true
  end

  # True when `mime` is one of the CSV MIME types or, failing that, when
  # `path` has a ".csv" extension (compared case-insensitively). Either
  # argument may be nil.
  def accepts?(mime, path)
    return true if ["text/csv", "application/csv"].include?(mime.to_s)

    File.extname(path.to_s).downcase == ".csv"
  end

  # Parses the file at `path` into rows and hands them to the shared Table
  # emitter, returning whatever Table.emit returns.
  def convert(path)
    Table.emit(parse(path))
  end

  private

  # Returns the file's rows as an array of arrays of fields.
  #
  # The file is read (UTF-8, leading BOM stripped) before any code that
  # names a ::CSV constant, and the rescue around ::CSV.parse is only
  # reached once the library is known to be loaded. That way an I/O error
  # such as Errno::ENOENT propagates as itself instead of being masked by a
  # NameError on an unloaded ::CSV.
  def parse(path)
    raw = File.read(path, encoding: "bom|utf-8")
    return fallback_parse(raw) unless csv_library?

    begin
      ::CSV.parse(raw)
    rescue ::CSV::MalformedCSVError
      # Input the real parser rejects still gets a best-effort split.
      fallback_parse(raw)
    end
  end

  # Lazily loads the `csv` library; false when it is not installed.
  def csv_library?
    require "csv"
    true
  rescue LoadError
    false
  end

  # Minimal RFC-4180-ish parser for the no-stdlib-csv / malformed-input
  # case. Works strictly line by line: quote state is reset on every line,
  # so a stray unbalanced quote cannot swallow the rows that follow it. The
  # trade-off is that a quoted field containing a newline is split across
  # rows.
  def fallback_parse(raw)
    raw.to_s.each_line.map { |line| split_line(line.chomp) }
  end

  # Splits one line into fields on commas, honouring double-quoted fields:
  # inside quotes a comma is literal and a doubled quote ("") stands for one
  # quote character. Always returns at least one field (an empty line gives
  # [""]).
  def split_line(line)
    fields = []
    field = +""
    in_quotes = false
    # Index into a char array rather than the string: String#[] by character
    # index is linear on multibyte text, which made the scan quadratic.
    chars = line.chars
    i = 0
    while i < chars.length
      ch = chars[i]
      if in_quotes
        if ch != '"'
          field << ch
        elsif chars[i + 1] == '"'
          field << '"'
          i += 1 # consume the second quote of the escaped pair
        else
          in_quotes = false
        end
      elsif ch == '"'
        in_quotes = true
      elsif ch == ","
        fields << field
        field = +""
      else
        field << ch
      end
      i += 1
    end
    fields << field
  end
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# DOCX -> Markdown via the `docx` gem (MIT, OPTIONAL). markitdown gets this
|
|
7
|
+
# "for free" by going docx->HTML (mammoth) then through its HTML core; the
|
|
8
|
+
# Ruby `docx` gem instead hands us paragraphs (with a style name) and
|
|
9
|
+
# tables, so we map the structure directly:
|
|
10
|
+
# "Heading 1".."Heading 6" -> "#".."######"
|
|
11
|
+
# "Title" -> "#"
|
|
12
|
+
# list paragraphs -> "- " / "1. "
|
|
13
|
+
# bold/italic runs -> "**"/"*"
|
|
14
|
+
# tables -> GFM table via the shared Table emitter
|
|
15
|
+
# Known limitations (documented in specs): embedded images are dropped,
|
|
16
|
+
# nested tables are flattened, and run-level formatting beyond bold/italic
|
|
17
|
+
# is not preserved.
|
|
18
|
+
class Docx
  MIMES = %w[
    application/vnd.openxmlformats-officedocument.wordprocessingml.document
  ].freeze

  # True when the optional `docx` gem can be loaded.
  def available?
    require "docx"
    true
  rescue LoadError
    false
  end

  # True for the DOCX MIME type or, failing that, a ".docx" extension
  # (compared case-insensitively). Either argument may be nil.
  def accepts?(mime, path)
    return true if MIMES.include?(mime.to_s)

    File.extname(path.to_s).downcase == ".docx"
  end

  # Opens the document at `path` and returns its Markdown.
  #
  # All paragraphs are emitted first and the tables are appended after
  # them, so a table does not keep its position relative to the
  # surrounding text. Empty blocks are dropped and the rest are separated
  # by blank lines.
  def convert(path)
    require "docx"
    doc = ::Docx::Document.open(path)
    blocks = []
    # Prefer the iterator when this gem version offers one; otherwise fall
    # back to the paragraph collection.
    if doc.respond_to?(:each_paragraph)
      doc.each_paragraph { |para| blocks << paragraph_markdown(para) }
    else
      doc.paragraphs.each { |para| blocks << paragraph_markdown(para) }
    end
    doc.tables.each { |table| blocks << table_markdown(table) } if doc.respond_to?(:tables)
    blocks.compact.reject(&:empty?).join("\n\n")
  end

  private

  # Renders one paragraph, choosing the Markdown prefix from its style name:
  #   "Heading 1".."Heading 6" -> "#".."######"   (space optional)
  #   "Title" / "Subtitle"     -> "#" / "##"
  #   numbered list styles     -> "1. "  (renderers renumber consecutive items)
  #   other list/bullet styles -> "- "
  # Returns "" for a paragraph with no visible text.
  def paragraph_markdown(para)
    text = inline_text(para).strip
    return "" if text.empty?

    case paragraph_style(para)
    when /\AHeading\s*([1-6])\z/i
      "#{"#" * Regexp.last_match(1).to_i} #{text}"
    when /\ATitle\z/i
      "# #{text}"
    when /\ASubtitle\z/i
      "## #{text}"
    when /List\s*Number|Numbered/i
      # Must be tested before the generic list branch, which would also match.
      "1. #{text}"
    when /List|Bullet/i
      "- #{text}"
    else
      text
    end
  end

  # Returns the paragraph's style as a string, or "" when the paragraph has
  # no `style` reader or reading it raises. The value is returned as-is; the
  # spelling variants ("Heading1" / "heading 1") are absorbed by the
  # case-insensitive, space-optional patterns in paragraph_markdown.
  def paragraph_style(para)
    return "" unless para.respond_to?(:style)

    para.style.to_s
  rescue StandardError
    ""
  end

  # Joins a paragraph's text runs, wrapping bold/italic runs in Markdown
  # emphasis when the gem exposes run-level formatting.
  #
  # Consecutive runs with the same formatting are merged before wrapping,
  # so text that the authoring tool split into several bold runs becomes
  # one "**...**" span rather than "**a****b**".
  def inline_text(para)
    unless para.respond_to?(:each_text_run)
      return para.respond_to?(:text) ? para.text.to_s : para.to_s
    end

    segments = [] # [marker, text] pairs in run order
    para.each_text_run do |run|
      t = run.respond_to?(:text) ? run.text.to_s : run.to_s
      next if t.empty?

      marker = emphasis_marker(run)
      if segments.last && segments.last[0] == marker
        segments.last[1] << t
      else
        # dup: the segment text is appended to, and must not alias the
        # string owned by the run.
        segments << [marker, t.dup]
      end
    end
    segments.map { |marker, text| wrap_emphasis(text, marker) }.join
  end

  # "" for a plain run, "*" italic, "**" bold, "***" bold and italic.
  def emphasis_marker(run)
    "#{"**" if bold?(run)}#{"*" if italic?(run)}"
  end

  # Wraps `text` in `marker`, keeping leading/trailing whitespace outside
  # the markers: Markdown does not treat "**bold **" as emphasis because the
  # closing delimiter follows whitespace. Whitespace-only text is returned
  # unwrapped.
  def wrap_emphasis(text, marker)
    return text if marker.empty? || text.match?(/\A\s*\z/)

    lead = text[/\A\s*/]
    trail = text[/\s*\z/]
    core = text[lead.length...(text.length - trail.length)]
    "#{lead}#{marker}#{core}#{marker}#{trail}"
  end

  # The gem names these `bolded?`/`italicized?` (older forks use
  # `bold?`/`italic?`); probe both so run emphasis survives a version bump.
  # Any error while probing counts as "not bold".
  def bold?(run)
    (run.respond_to?(:bolded?) && run.bolded?) ||
      (run.respond_to?(:bold?) && run.bold?)
  rescue StandardError
    false
  end

  # Italic counterpart of bold?; any error while probing counts as
  # "not italic".
  def italic?(run)
    (run.respond_to?(:italicized?) && run.italicized?) ||
      (run.respond_to?(:italic?) && run.italic?)
  rescue StandardError
    false
  end

  # Collects each cell's text row by row and renders it through the shared
  # Table emitter. Best-effort: any failure yields "" so one bad table does
  # not abort the whole document.
  def table_markdown(table)
    rows = table.rows.map do |row|
      row.cells.map { |cell| cell.respond_to?(:text) ? cell.text.to_s : cell.to_s }
    end
    Table.emit(rows)
  rescue StandardError
    ""
  end
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# HTML / XHTML -> Markdown via the shared HTML core (Documents::Html).
|
|
7
|
+
# Thin by design: read the file, hand the bytes to the core. This is the
|
|
8
|
+
# engine the other shaped-as-HTML converters reuse.
|
|
9
|
+
class Html
  # No optional dependency is involved, so this converter is always usable.
  def available?
    true
  end

  # Matches the HTML/XHTML MIME types first; anything else is decided by
  # the file extension (.html, .htm, .xhtml; compared case-insensitively).
  def accepts?(mime, path)
    case mime.to_s
    when "text/html", "application/xhtml+xml"
      true
    else
      extension = File.extname(path.to_s).downcase
      %w[.html .htm .xhtml].include?(extension)
    end
  end

  # Reads the file as UTF-8 (dropping a leading BOM) and passes the markup
  # straight to the shared HTML core.
  def convert(path)
    Documents::Html.to_markdown(File.read(path, encoding: "bom|utf-8"))
  end
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Rubino
|
|
6
|
+
module Documents
|
|
7
|
+
module Converters
|
|
8
|
+
# JSON -> Markdown: pretty-printed inside a ```json fence (same as
|
|
9
|
+
# markitdown). On a parse error we fence the raw bytes verbatim rather than
|
|
10
|
+
# failing -- the model still gets the content, just unprettified.
|
|
11
|
+
class Json
  # Uses only the `json` standard library, so it is always usable.
  def available?
    true
  end

  # True for "application/json", any "+json" structured-syntax MIME type,
  # or a ".json" extension (compared case-insensitively). Either argument
  # may be nil.
  def accepts?(mime, path)
    m = mime.to_s
    return true if m == "application/json" || m.end_with?("+json")

    File.extname(path.to_s).downcase == ".json"
  end

  # Returns the file's JSON inside a fenced `json` code block.
  #
  # Valid JSON is pretty-printed. On a parse error the raw text (stripped)
  # is fenced instead, so the reader still gets the content. The fence is
  # sized to the content: raw text may itself contain a ``` line, which
  # would otherwise close a fixed three-backtick fence early.
  def convert(path)
    raw = File.read(path, encoding: "bom|utf-8")
    body = begin
      ::JSON.pretty_generate(::JSON.parse(raw))
    rescue ::JSON::ParserError
      raw.strip
    end
    fence = fence_for(body)
    "#{fence}json\n#{body}\n#{fence}\n"
  end

  private

  # A backtick fence one longer than the longest backtick run in `body`,
  # and never shorter than the usual three. Scans a scrubbed copy so that
  # invalid bytes in unparseable input cannot make the regex scan raise.
  def fence_for(body)
    longest = body.scrub.scan(/`+/).map(&:length).max.to_i
    "`" * [3, longest + 1].max
  end
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# PDF -> Markdown via `pdf-reader` (pure Ruby, MIT, OPTIONAL). Text-first:
|
|
7
|
+
# each page's text is extracted and pages are joined with a blank line.
|
|
8
|
+
# Honest limits (documented in specs):
|
|
9
|
+
# - No OCR. A scanned / image-only PDF yields no extractable text; we
|
|
10
|
+
# return a clear "no extractable text (scanned?)" note, not a crash.
|
|
11
|
+
# - Multi-column / complex layout: pdf-reader gives reading order by
|
|
12
|
+
# token position, which is imperfect for multi-column pages -- word
|
|
13
|
+
# order may differ from the visual layout. Best-effort, not exact.
|
|
14
|
+
# - The token-position table heuristic markitdown does with pdfplumber is
|
|
15
|
+
# intentionally deferred; it is the hard, low-ceiling part.
|
|
16
|
+
class Pdf
  MIMES = %w[application/pdf].freeze

  # True when the optional `pdf-reader` gem can be loaded.
  def available?
    require "pdf/reader"
    true
  rescue LoadError
    false
  end

  # True for the PDF MIME type or, failing that, a ".pdf" extension
  # (compared case-insensitively). Either argument may be nil.
  def accepts?(mime, path)
    return true if MIMES.include?(mime.to_s)

    File.extname(path.to_s).downcase == ".pdf"
  end

  # Returns the PDF's text, one block per non-empty page separated by blank
  # lines, or the "no extractable text" note when nothing could be read.
  #
  # The require deliberately sits outside the rescued region: the rescue
  # list names PDF::Reader constants, so a LoadError from a missing gem
  # must not reach it, or it would be replaced by a NameError on the
  # unloaded PDF constant.
  def convert(path)
    require "pdf/reader"
    extract_text(path)
  end

  private

  # Reads every page; PDFs the reader reports as malformed or unsupported
  # get the same note as a PDF with no text.
  def extract_text(path)
    pages = PDF::Reader.new(path).pages.map { |page| page_text(page) }
    text = pages.reject(&:empty?).join("\n\n")
    text.strip.empty? ? scanned_note : text
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError
    scanned_note
  end

  # One page's text with trailing spaces/tabs removed from each line and
  # the whole block stripped. A page that fails to extract contributes ""
  # rather than aborting the document.
  def page_text(page)
    page.text.to_s.gsub(/[ \t]+\n/, "\n").strip
  rescue StandardError
    ""
  end

  # The Markdown note returned in place of text.
  def scanned_note
    "_(No extractable text found in this PDF -- it may be scanned or " \
      "image-only. No OCR is performed in-process.)_"
  end
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# Plain text / source code -> Markdown. The last-resort converter: it
|
|
7
|
+
# accepts any text/* MIME (and is the registry's final fallback). Markdown
|
|
8
|
+
# passes through unchanged; other source files are wrapped in a fenced code
|
|
9
|
+
# block tagged with the language inferred from the extension, so the model
|
|
10
|
+
# sees the code as code, not prose. Encoding is normalised to UTF-8.
|
|
11
|
+
class Plain
  # Extension -> fenced-code language hint. Markdown/plain text are NOT
  # fenced (they pass through). Anything else with a known mapping fences.
  LANGS = {
    ".rb" => "ruby", ".py" => "python", ".js" => "javascript",
    ".ts" => "typescript", ".jsx" => "jsx", ".tsx" => "tsx",
    ".go" => "go", ".rs" => "rust", ".java" => "java", ".c" => "c",
    ".h" => "c", ".cpp" => "cpp", ".cc" => "cpp", ".hpp" => "cpp",
    ".cs" => "csharp", ".php" => "php", ".rb_" => "ruby",
    ".sh" => "bash", ".bash" => "bash", ".zsh" => "bash",
    ".sql" => "sql", ".yml" => "yaml", ".yaml" => "yaml",
    ".toml" => "toml", ".ini" => "ini", ".css" => "css",
    ".scss" => "scss", ".swift" => "swift", ".kt" => "kotlin",
    ".lua" => "lua", ".pl" => "perl", ".r" => "r"
  }.freeze

  MARKDOWN_EXTS = %w[.md .markdown .mdown .mkd].freeze

  # No optional dependency is involved, so this converter is always usable.
  def available?
    true
  end

  # Accepts anything textual: text/* MIME, the textual application/* types,
  # or -- as the final fallback -- a file with a known code/markdown
  # extension even when MIME is unknown.
  def accepts?(mime, path)
    m = mime.to_s
    return true if m.start_with?("text/")
    return true if textual_application?(m)

    ext = File.extname(path.to_s).downcase
    MARKDOWN_EXTS.include?(ext) || LANGS.key?(ext)
  end

  # Returns the file's text as Markdown.
  #
  # The bytes are treated as UTF-8; invalid sequences are replaced with
  # U+FFFD and a leading byte-order mark is dropped. Markdown files and
  # files with no known language pass through unchanged; known source
  # files are wrapped in a code fence tagged with their language. The fence
  # is sized to the content so that a ``` line inside the file cannot close
  # it early.
  def convert(path)
    raw = File.binread(path).force_encoding("UTF-8")
    raw = raw.scrub("\uFFFD") unless raw.valid_encoding?
    raw = raw.delete_prefix("\uFEFF")
    ext = File.extname(path.to_s).downcase

    return raw if MARKDOWN_EXTS.include?(ext)

    lang = LANGS[ext]
    return raw if lang.nil? # unknown text: pass through as-is

    body = raw.chomp
    fence = fence_for(body)
    "#{fence}#{lang}\n#{body}\n#{fence}\n"
  end

  private

  # A backtick fence one longer than the longest backtick run in `body`,
  # and never shorter than the usual three.
  def fence_for(body)
    longest = body.scan(/`+/).map(&:length).max.to_i
    "`" * [3, longest + 1].max
  end

  # True for the application/* MIME types whose payload is text.
  def textual_application?(mime)
    mime == "application/json" || mime == "application/xml" ||
      mime == "application/javascript" || mime == "application/x-yaml" ||
      mime.end_with?("+json") || mime.end_with?("+xml")
  end
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# PPTX -> Markdown via the `ruby_powerpoint` gem (MIT, OPTIONAL). Each
|
|
7
|
+
# slide becomes a `## Slide N` heading; the slide's text frames become
|
|
8
|
+
# paragraphs/bullets and speaker notes go under a `>` block quote. The gem
|
|
9
|
+
# gives us text per slide (and notes); it does not preserve shape geometry,
|
|
10
|
+
# so we emit text in document order -- good enough for an LLM to read.
|
|
11
|
+
class Pptx
  MIMES = %w[
    application/vnd.openxmlformats-officedocument.presentationml.presentation
  ].freeze

  # True when the optional `ruby_powerpoint` gem can be required.
  def available?
    require "ruby_powerpoint"
    true
  rescue LoadError
    false
  end

  # Accepts the PPTX MIME type, or any path ending in ".pptx"
  # (case-insensitive) when the MIME type is something else.
  def accepts?(mime, path)
    return true if MIMES.include?(mime.to_s)

    File.extname(path.to_s).downcase == ".pptx"
  end

  # Converts the presentation at +path+ to Markdown, one section per slide,
  # sections separated by a blank line. Slides that yield no content are
  # omitted.
  #
  # Raises LoadError when the gem is missing, and whatever the gem raises
  # when the file cannot be opened.
  def convert(path)
    require "ruby_powerpoint"
    ppt = RubyPowerpoint::Presentation.new(path)
    parts = ppt.slides.each_with_index.map { |slide, i| slide_markdown(slide, i + 1) }
    parts.compact.join("\n\n")
  ensure
    # Release the underlying archive handle. `ppt` is nil when the require or
    # the constructor raised, in which case there is nothing to close.
    ppt.close if ppt.respond_to?(:close)
  end

  private

  # Builds the Markdown for one slide: a "## Slide N" heading, an optional
  # "### title", one bullet per text fragment and the speaker notes as a
  # block quote. Returns nil when the slide contributes nothing beyond the
  # heading, or when reading it raises (best-effort: one broken slide must
  # not abort the whole deck).
  def slide_markdown(slide, number)
    lines = ["## Slide #{number}"]

    title = slide.respond_to?(:title) ? slide.title.to_s.strip : ""
    lines << "### #{title}" unless title.empty?

    # NOTE(review): the published ruby_powerpoint gem appears to expose
    # `content` / `notes_content` rather than `text` / `notes` -- confirm
    # against the version in use. Both spellings are probed, original first.
    body = text_fragments(first_available(slide, :text, :content))
    body.each { |t| lines << "- #{t}" unless t == title }

    notes = text_fragments(first_available(slide, :notes, :notes_content)).join("\n")
    lines << "\n#{quote("Notes: #{notes}")}" unless notes.empty?

    return nil if lines.length == 1 # only the "## Slide N" header

    lines.join("\n")
  rescue StandardError
    nil
  end

  # Calls the first of +readers+ that +slide+ responds to and returns its
  # value; nil when it responds to none of them.
  def first_available(slide, *readers)
    reader = readers.find { |name| slide.respond_to?(name) }
    reader && slide.public_send(reader)
  end

  # Normalises a String, a (nested) Array of strings or nil into a flat
  # list of stripped, non-empty strings.
  def text_fragments(value)
    Array(value).flatten.map { |t| t.to_s.strip }.reject(&:empty?)
  end

  # Prefixes every line of +text+ with "> " so multi-line notes stay inside
  # the block quote (blank lines become a bare ">").
  def quote(text)
    text.lines(chomp: true).map { |line| "> #{line}".rstrip }.join("\n")
  end
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# XLSX (and ODS/legacy XLS where roo supports them) -> Markdown. Each
|
|
7
|
+
# sheet becomes a `## SheetName` heading followed by a GFM table emitted by
|
|
8
|
+
# the shared Table emitter. The `roo` gem (MIT) is OPTIONAL: #available?
|
|
9
|
+
# reports false when it can't be required, so the registry never offers
|
|
10
|
+
# this converter on an install without roo -- the caller then falls back to
|
|
11
|
+
# the shell-extraction hint.
|
|
12
|
+
class Xlsx
  MIMES = %w[
    application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
    application/vnd.oasis.opendocument.spreadsheet
    application/vnd.ms-excel
  ].freeze
  EXTS = %w[.xlsx .ods .xls].freeze

  # Reports whether the optional `roo` gem can be loaded.
  def available?
    require "roo"
  rescue LoadError
    false
  else
    true
  end

  # A file is accepted when its MIME type is one of MIMES or, failing that,
  # when its (case-insensitive) extension is one of EXTS.
  def accepts?(mime, path)
    MIMES.include?(mime.to_s) || EXTS.include?(File.extname(path.to_s).downcase)
  end

  # Opens the workbook at +path+ and returns the Markdown of every sheet
  # that produced a table, joined by blank lines. The workbook is closed
  # afterwards when it offers #close.
  def convert(path)
    require "roo"
    book = Roo::Spreadsheet.open(path)
    sections = []
    book.sheets.each do |sheet_name|
      section = sheet_markdown(book, sheet_name)
      sections << section unless section.nil?
    end
    sections.join("\n\n")
  ensure
    # `book` is nil here when the require or the open call raised.
    book.close if book.respond_to?(:close)
  end

  private

  # Renders one sheet as "## <name>" followed by the table produced by
  # Table.emit from the sheet's cell grid. Returns nil when the emitted
  # table is empty or when reading the sheet raises a StandardError.
  def sheet_markdown(book, name)
    sheet = book.sheet(name)
    grid =
      if sheet.first_row && sheet.last_row
        (sheet.first_row..sheet.last_row).map do |row|
          (sheet.first_column..sheet.last_column).map { |col| sheet.cell(row, col) }
        end
      else
        []
      end

    table = Table.emit(grid)
    table.empty? ? nil : "## #{name}\n\n#{table}"
  rescue StandardError
    nil
  end
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Rubino
|
|
4
|
+
module Documents
|
|
5
|
+
module Converters
|
|
6
|
+
# XML -> Markdown: pretty-printed inside a ```xml fence (markitdown does
|
|
7
|
+
# the same for generic XML). Uses stdlib REXML, which ships with Ruby; if
|
|
8
|
+
# pretty-printing fails we fence the raw bytes. SVG is deliberately NOT
|
|
9
|
+
# handled here -- Classify routes SVG to :text, never :document.
|
|
10
|
+
class Xml
  # Always offered; REXML is loaded lazily and its absence is tolerated.
  def available?
    true
  end

  # SVG is rejected outright. Otherwise a file is accepted for the generic
  # XML MIME types, any "+xml" suffixed type, or a ".xml" extension
  # (case-insensitive).
  def accepts?(mime, path)
    type = mime.to_s
    case type
    when "image/svg+xml" then false
    when "application/xml", "text/xml" then true
    else type.end_with?("+xml") || File.extname(path.to_s).downcase == ".xml"
    end
  end

  # Reads the file as UTF-8 (dropping a leading BOM) and returns it inside
  # an ```xml fence, pretty-printed when possible and otherwise as the
  # stripped raw text.
  def convert(path)
    raw = File.read(path, encoding: "bom|utf-8")
    body = pretty_print(raw)
    body = raw.strip if body.nil?
    "```xml\n#{body}\n```\n"
  end

  private

  # Re-indents +raw+ with REXML's pretty formatter (2-space indent, compact
  # mode). Returns nil when the output is blank, when REXML cannot be
  # loaded, or when parsing/formatting raises.
  def pretty_print(raw)
    require "rexml/document"
    document = REXML::Document.new(raw)
    printer = REXML::Formatters::Pretty.new(2)
    printer.compact = true
    buffer = +""
    printer.write(document, buffer)
    rendered = buffer.strip
    rendered unless rendered.empty?
  rescue LoadError, StandardError
    nil
  end
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "kramdown"
|
|
4
|
+
|
|
5
|
+
module Rubino
|
|
6
|
+
module Documents
|
|
7
|
+
# The ONE HTML->Markdown core (markitdown's `HtmlConverter` / `markdownify`
|
|
8
|
+
# equivalent). Every converter that can shape its content as HTML (the html
|
|
9
|
+
# converter itself, and docx via a paragraphs->HTML step) feeds this. Built
|
|
10
|
+
# on kramdown, which is ALREADY a rubino dependency, so no new lib is added.
|
|
11
|
+
#
|
|
12
|
+
# kramdown parses HTML and emits Markdown but defaults to reference-style
|
|
13
|
+
# links ([text][1] + a [1]: url footer). LLMs read inline links more
|
|
14
|
+
# naturally, so we post-process the reference definitions back inline. We
|
|
15
|
+
# also strip non-content elements (script/style) before conversion.
|
|
16
|
+
module Html
  # One reference-definition line as emitted by kramdown:
  #   [label]: url            or            [label]: url "title"
  # Capture 1 is the label, capture 2 the URL.
  REFERENCE_DEFINITION = /^\s*\[([^\]]+)\]:\s+(\S+)(?:\s+"[^"]*")?\s*$/

  # A reference-style link or image: [text][label], ![alt][label], or the
  # collapsed form [text][] whose label is the text itself.
  REFERENCE_LINK = /(!?)\[([^\]]*)\]\[([^\]]*)\]/

  module_function

  # Converts an HTML string to Markdown. Returns "" on failure rather than
  # raising -- the caller (to_markdown) treats empty as nil.
  #
  # html - HTML source (anything responding to #to_s, or nil).
  def to_markdown(html)
    return "" if html.nil? || html.to_s.strip.empty?

    cleaned = strip_noise(html.to_s)
    md = Kramdown::Document.new(
      cleaned,
      input: "html",
      html_to_native: true
    ).to_kramdown
    inline_reference_links(md).strip
  rescue StandardError
    ""
  end

  # Removes script/style/head blocks (their text is not document content),
  # HTML comments, and the html/body/doctype document-wrapper tags, which
  # kramdown otherwise leaves as literal `<html>...</html>` lines around the
  # converted body. What's left is the inner content kramdown shapes into
  # Markdown.
  def strip_noise(html)
    html
      .gsub(%r{<script\b[^>]*>.*?</script>}mi, "")
      .gsub(%r{<style\b[^>]*>.*?</style>}mi, "")
      .gsub(%r{<head\b[^>]*>.*?</head>}mi, "")
      .gsub(/<!--.*?-->/m, "")
      .gsub(%r{</?(?:html|body|!doctype)\b[^>]*>}mi, "")
  end

  # Rewrites kramdown's reference-style links/images back to inline form:
  #   [text][1] ... [1]: http://x   ->   [text](http://x)
  #   [text][]  ... [text]: http://x -> [text](http://x)
  # Leaves the body untouched when there are no reference definitions.
  #
  # A definition line is removed only when it was recognised AND no
  # reference to its label survived the rewrite; a link that could not be
  # inlined (e.g. its text contains brackets) therefore keeps a working
  # target instead of dangling.
  def inline_reference_links(markdown)
    defs = markdown.each_line.each_with_object({}) do |line, found|
      m = line.match(REFERENCE_DEFINITION)
      found[m[1]] = m[2] if m
    end
    return markdown if defs.empty?

    body = markdown.gsub(REFERENCE_LINK) do
      match = Regexp.last_match
      bang = match[1]
      text = match[2]
      ref = match[3]
      # Collapsed reference ([text][]): the link text doubles as the label.
      url = defs[ref.empty? ? text : ref]
      url ? "#{bang}[#{text}](#{url})" : match[0]
    end

    # Drop the now-inlined reference-definition lines.
    body.each_line.reject do |line|
      m = line.match(REFERENCE_DEFINITION)
      m && !body.include?("][#{m[1]}]")
    end.join
  end
end
|
|
70
|
+
end
|
|
71
|
+
end
|