shoko 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.bundle/config +4 -0
- data/.bundle/config.bak +3 -0
- data/.rspec_status +42 -0
- data/.rubocop.yml +124 -0
- data/Gemfile +19 -0
- data/LICENSE +21 -0
- data/README.md +82 -0
- data/Rakefile +29 -0
- data/bin/start +15 -0
- data/lib/shoko/adapters/book_sources/document_service.rb +201 -0
- data/lib/shoko/adapters/book_sources/download_service.rb +95 -0
- data/lib/shoko/adapters/book_sources/epub/epub_resource_loader.rb +137 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/html_processor.rb +151 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/metadata_extractor.rb +53 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/entry_reader.rb +77 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/metadata_extractor.rb +67 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_context.rb +86 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_document_index.rb +75 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_document_scanner.rb +47 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_extractor.rb +46 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_label_resolver.rb +83 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_list_item.rb +55 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_result.rb +8 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_selector.rb +100 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_source_locator.rb +93 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_traversal.rb +103 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf/navigation_walker.rb +56 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/opf_processor.rb +102 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/xhtml_content_parser.rb +661 -0
- data/lib/shoko/adapters/book_sources/epub/parsers/xml_text_normalizer.rb +41 -0
- data/lib/shoko/adapters/book_sources/epub_document.rb +253 -0
- data/lib/shoko/adapters/book_sources/epub_finder/directory_scanner.rb +134 -0
- data/lib/shoko/adapters/book_sources/epub_finder/scanner_context.rb +28 -0
- data/lib/shoko/adapters/book_sources/epub_finder.rb +161 -0
- data/lib/shoko/adapters/book_sources/epub_importer.rb +268 -0
- data/lib/shoko/adapters/book_sources/gutendex_client.rb +150 -0
- data/lib/shoko/adapters/book_sources/library_scanner.rb +93 -0
- data/lib/shoko/adapters/book_sources/source_fingerprint.rb +57 -0
- data/lib/shoko/adapters/input/annotations/mouse_handler.rb +84 -0
- data/lib/shoko/adapters/input/command_bridge.rb +148 -0
- data/lib/shoko/adapters/input/command_factory.rb +255 -0
- data/lib/shoko/adapters/input/commands.rb +60 -0
- data/lib/shoko/adapters/input/dispatcher.rb +69 -0
- data/lib/shoko/adapters/input/input_controller.rb +250 -0
- data/lib/shoko/adapters/input/key_definitions.rb +108 -0
- data/lib/shoko/adapters/input/validators/file_path_validator.rb +81 -0
- data/lib/shoko/adapters/input/validators/terminal_size_validator.rb +76 -0
- data/lib/shoko/adapters/monitoring/logger.rb +150 -0
- data/lib/shoko/adapters/monitoring/perf_tracer.rb +183 -0
- data/lib/shoko/adapters/monitoring/performance_monitor.rb +110 -0
- data/lib/shoko/adapters/output/clipboard/clipboard_service.rb +125 -0
- data/lib/shoko/adapters/output/formatting/formatting_service/line_assembler/image_builder.rb +149 -0
- data/lib/shoko/adapters/output/formatting/formatting_service/line_assembler/text_wrapper.rb +149 -0
- data/lib/shoko/adapters/output/formatting/formatting_service/line_assembler/tokenizer.rb +91 -0
- data/lib/shoko/adapters/output/formatting/formatting_service/line_assembler.rb +144 -0
- data/lib/shoko/adapters/output/formatting/formatting_service/plain_lines_builder.rb +54 -0
- data/lib/shoko/adapters/output/formatting/formatting_service.rb +247 -0
- data/lib/shoko/adapters/output/formatting/wrapping_service.rb +228 -0
- data/lib/shoko/adapters/output/instrumentation_service.rb +52 -0
- data/lib/shoko/adapters/output/kitty/image_transcoder.rb +71 -0
- data/lib/shoko/adapters/output/kitty/kitty_graphics.rb +114 -0
- data/lib/shoko/adapters/output/kitty/kitty_image_renderer.rb +239 -0
- data/lib/shoko/adapters/output/kitty/kitty_unicode_placeholders.rb +139 -0
- data/lib/shoko/adapters/output/kitty/kitty_unicode_placeholders_diacritic_codepoints.txt +26 -0
- data/lib/shoko/adapters/output/notification_service.rb +58 -0
- data/lib/shoko/adapters/output/render_registry.rb +45 -0
- data/lib/shoko/adapters/output/rendering/models/line_geometry.rb +60 -0
- data/lib/shoko/adapters/output/rendering/models/page_rendering_context.rb +22 -0
- data/lib/shoko/adapters/output/rendering/models/render_params.rb +28 -0
- data/lib/shoko/adapters/output/rendering/models/rendering_context.rb +58 -0
- data/lib/shoko/adapters/output/terminal/buffer.rb +275 -0
- data/lib/shoko/adapters/output/terminal/constants/terminal_defaults.rb +11 -0
- data/lib/shoko/adapters/output/terminal/input/decoder.rb +347 -0
- data/lib/shoko/adapters/output/terminal/input.rb +161 -0
- data/lib/shoko/adapters/output/terminal/output.rb +105 -0
- data/lib/shoko/adapters/output/terminal/terminal.rb +167 -0
- data/lib/shoko/adapters/output/terminal/terminal_sanitizer.rb +243 -0
- data/lib/shoko/adapters/output/terminal/terminal_service.rb +138 -0
- data/lib/shoko/adapters/output/terminal/text_metrics.rb +273 -0
- data/lib/shoko/adapters/output/ui/builders/page_setup_builder.rb +47 -0
- data/lib/shoko/adapters/output/ui/components/annotation_editor_overlay/footer_renderer.rb +80 -0
- data/lib/shoko/adapters/output/ui/components/annotation_editor_overlay/geometry.rb +61 -0
- data/lib/shoko/adapters/output/ui/components/annotation_editor_overlay/note_renderer.rb +86 -0
- data/lib/shoko/adapters/output/ui/components/annotation_editor_overlay_component.rb +234 -0
- data/lib/shoko/adapters/output/ui/components/annotations_overlay/list_renderer.rb +142 -0
- data/lib/shoko/adapters/output/ui/components/annotations_overlay_component.rb +185 -0
- data/lib/shoko/adapters/output/ui/components/base_component.rb +110 -0
- data/lib/shoko/adapters/output/ui/components/component_interface.rb +80 -0
- data/lib/shoko/adapters/output/ui/components/content_component.rb +61 -0
- data/lib/shoko/adapters/output/ui/components/enhanced_popup_menu.rb +191 -0
- data/lib/shoko/adapters/output/ui/components/footer_component.rb +120 -0
- data/lib/shoko/adapters/output/ui/components/header_component.rb +46 -0
- data/lib/shoko/adapters/output/ui/components/layouts/horizontal.rb +63 -0
- data/lib/shoko/adapters/output/ui/components/layouts/vertical.rb +73 -0
- data/lib/shoko/adapters/output/ui/components/main_menu_component.rb +103 -0
- data/lib/shoko/adapters/output/ui/components/reading/base_view_renderer.rb +199 -0
- data/lib/shoko/adapters/output/ui/components/reading/config_helpers.rb +42 -0
- data/lib/shoko/adapters/output/ui/components/reading/help_renderer.rb +62 -0
- data/lib/shoko/adapters/output/ui/components/reading/inline_segment_highlighter.rb +144 -0
- data/lib/shoko/adapters/output/ui/components/reading/kitty_image_line_renderer.rb +262 -0
- data/lib/shoko/adapters/output/ui/components/reading/line_content_composer.rb +114 -0
- data/lib/shoko/adapters/output/ui/components/reading/line_drawer.rb +87 -0
- data/lib/shoko/adapters/output/ui/components/reading/line_geometry_builder.rb +41 -0
- data/lib/shoko/adapters/output/ui/components/reading/rendered_lines_recorder.rb +64 -0
- data/lib/shoko/adapters/output/ui/components/reading/single_view_renderer.rb +156 -0
- data/lib/shoko/adapters/output/ui/components/reading/split_view_renderer.rb +221 -0
- data/lib/shoko/adapters/output/ui/components/reading/view_renderer_factory.rb +20 -0
- data/lib/shoko/adapters/output/ui/components/reading/wrapped_lines_fetcher.rb +139 -0
- data/lib/shoko/adapters/output/ui/components/rect.rb +15 -0
- data/lib/shoko/adapters/output/ui/components/render_style.rb +84 -0
- data/lib/shoko/adapters/output/ui/components/screen_component.rb +24 -0
- data/lib/shoko/adapters/output/ui/components/screens/annotation_detail_screen_component.rb +175 -0
- data/lib/shoko/adapters/output/ui/components/screens/annotation_edit_screen_component.rb +221 -0
- data/lib/shoko/adapters/output/ui/components/screens/annotation_editor_screen_component.rb +205 -0
- data/lib/shoko/adapters/output/ui/components/screens/annotation_rendering_helpers.rb +190 -0
- data/lib/shoko/adapters/output/ui/components/screens/annotations_screen_component.rb +266 -0
- data/lib/shoko/adapters/output/ui/components/screens/base_screen_component.rb +49 -0
- data/lib/shoko/adapters/output/ui/components/screens/browse_screen_component.rb +319 -0
- data/lib/shoko/adapters/output/ui/components/screens/download_books_screen_component.rb +340 -0
- data/lib/shoko/adapters/output/ui/components/screens/library_screen_component.rb +205 -0
- data/lib/shoko/adapters/output/ui/components/screens/loading_overlay_component.rb +49 -0
- data/lib/shoko/adapters/output/ui/components/screens/menu_screen_component.rb +107 -0
- data/lib/shoko/adapters/output/ui/components/screens/settings_screen_component.rb +238 -0
- data/lib/shoko/adapters/output/ui/components/sidebar/annotations_tab_renderer.rb +159 -0
- data/lib/shoko/adapters/output/ui/components/sidebar/bookmarks_tab_renderer.rb +139 -0
- data/lib/shoko/adapters/output/ui/components/sidebar/tab_header_component.rb +157 -0
- data/lib/shoko/adapters/output/ui/components/sidebar/toc_tab_renderer.rb +111 -0
- data/lib/shoko/adapters/output/ui/components/sidebar/toc_tab_support.rb +1606 -0
- data/lib/shoko/adapters/output/ui/components/sidebar_panel_component.rb +217 -0
- data/lib/shoko/adapters/output/ui/components/surface.rb +88 -0
- data/lib/shoko/adapters/output/ui/components/tooltip_overlay_component.rb +224 -0
- data/lib/shoko/adapters/output/ui/components/ui/box_drawer.rb +32 -0
- data/lib/shoko/adapters/output/ui/components/ui/list_helpers.rb +33 -0
- data/lib/shoko/adapters/output/ui/components/ui/overlay_layout.rb +79 -0
- data/lib/shoko/adapters/output/ui/components/ui/text_utils.rb +46 -0
- data/lib/shoko/adapters/output/ui/constants/highlighting.rb +21 -0
- data/lib/shoko/adapters/output/ui/constants/messages.rb +12 -0
- data/lib/shoko/adapters/output/ui/constants/themes.rb +79 -0
- data/lib/shoko/adapters/output/ui/constants/ui_constants.rb +85 -0
- data/lib/shoko/adapters/output/ui/rendering/frame_coordinator.rb +42 -0
- data/lib/shoko/adapters/output/ui/rendering/reader_render_coordinator.rb +169 -0
- data/lib/shoko/adapters/output/ui/rendering/render_pipeline.rb +55 -0
- data/lib/shoko/adapters/storage/atomic_file_writer.rb +43 -0
- data/lib/shoko/adapters/storage/background_worker.rb +66 -0
- data/lib/shoko/adapters/storage/book_cache_pipeline.rb +653 -0
- data/lib/shoko/adapters/storage/cache/epub/memory_cache.rb +99 -0
- data/lib/shoko/adapters/storage/cache/epub/persistence.rb +131 -0
- data/lib/shoko/adapters/storage/cache/epub/serializer/deserialize.rb +225 -0
- data/lib/shoko/adapters/storage/cache/epub/serializer/helpers.rb +63 -0
- data/lib/shoko/adapters/storage/cache/epub/serializer/serialize.rb +83 -0
- data/lib/shoko/adapters/storage/cache/epub/serializer.rb +5 -0
- data/lib/shoko/adapters/storage/cache/epub/source_reference.rb +58 -0
- data/lib/shoko/adapters/storage/cache_paths.rb +21 -0
- data/lib/shoko/adapters/storage/cache_pointer_manager.rb +60 -0
- data/lib/shoko/adapters/storage/config_paths.rb +30 -0
- data/lib/shoko/adapters/storage/epub_cache.rb +195 -0
- data/lib/shoko/adapters/storage/file_writer_service.rb +47 -0
- data/lib/shoko/adapters/storage/json_cache_store/chapters.rb +141 -0
- data/lib/shoko/adapters/storage/json_cache_store/layouts.rb +67 -0
- data/lib/shoko/adapters/storage/json_cache_store/manifest.rb +42 -0
- data/lib/shoko/adapters/storage/json_cache_store/payload_helpers.rb +113 -0
- data/lib/shoko/adapters/storage/json_cache_store/resources.rb +84 -0
- data/lib/shoko/adapters/storage/json_cache_store.rb +167 -0
- data/lib/shoko/adapters/storage/lazy_file_string.rb +65 -0
- data/lib/shoko/adapters/storage/pagination_cache.rb +127 -0
- data/lib/shoko/adapters/storage/recent_files.rb +78 -0
- data/lib/shoko/adapters/storage/repositories/annotation_repository.rb +182 -0
- data/lib/shoko/adapters/storage/repositories/base_repository.rb +81 -0
- data/lib/shoko/adapters/storage/repositories/bookmark_repository.rb +132 -0
- data/lib/shoko/adapters/storage/repositories/cached_library_repository.rb +129 -0
- data/lib/shoko/adapters/storage/repositories/config_repository.rb +262 -0
- data/lib/shoko/adapters/storage/repositories/progress_repository.rb +166 -0
- data/lib/shoko/adapters/storage/repositories/storage/annotation_file_store.rb +128 -0
- data/lib/shoko/adapters/storage/repositories/storage/bookmark_file_store.rb +109 -0
- data/lib/shoko/adapters/storage/repositories/storage/file_store_utils.rb +20 -0
- data/lib/shoko/adapters/storage/repositories/storage/progress_file_store.rb +59 -0
- data/lib/shoko/application/annotation_editor_overlay_session.rb +138 -0
- data/lib/shoko/application/cli.rb +134 -0
- data/lib/shoko/application/controllers/menu/input_controller.rb +189 -0
- data/lib/shoko/application/controllers/menu/state_controller.rb +642 -0
- data/lib/shoko/application/controllers/menu_controller.rb +469 -0
- data/lib/shoko/application/controllers/mouseable_reader.rb +377 -0
- data/lib/shoko/application/controllers/reader_controller.rb +449 -0
- data/lib/shoko/application/controllers/state_controller.rb +410 -0
- data/lib/shoko/application/controllers/ui_controller.rb +782 -0
- data/lib/shoko/application/dependency_container.rb +301 -0
- data/lib/shoko/application/infrastructure/event_bus.rb +80 -0
- data/lib/shoko/application/infrastructure/observer_state_store.rb +136 -0
- data/lib/shoko/application/infrastructure/state_store.rb +413 -0
- data/lib/shoko/application/main_menu/menu_progress_presenter.rb +83 -0
- data/lib/shoko/application/pending_jump_handler.rb +122 -0
- data/lib/shoko/application/reader_lifecycle.rb +65 -0
- data/lib/shoko/application/reader_startup_orchestrator.rb +113 -0
- data/lib/shoko/application/selectors/config_selectors.rb +62 -0
- data/lib/shoko/application/selectors/menu_selectors.rb +62 -0
- data/lib/shoko/application/selectors/reader_selectors.rb +186 -0
- data/lib/shoko/application/state/actions/base_action.rb +24 -0
- data/lib/shoko/application/state/actions/quit_to_menu_action.rb +16 -0
- data/lib/shoko/application/state/actions/switch_reader_mode_action.rb +22 -0
- data/lib/shoko/application/state/actions/toggle_view_mode_action.rb +31 -0
- data/lib/shoko/application/state/actions/update_annotation_editor_overlay_action.rb +27 -0
- data/lib/shoko/application/state/actions/update_annotations_action.rb +20 -0
- data/lib/shoko/application/state/actions/update_annotations_overlay_action.rb +27 -0
- data/lib/shoko/application/state/actions/update_bookmarks_action.rb +20 -0
- data/lib/shoko/application/state/actions/update_chapter_action.rb +24 -0
- data/lib/shoko/application/state/actions/update_config_action.rb +22 -0
- data/lib/shoko/application/state/actions/update_field_helpers.rb +26 -0
- data/lib/shoko/application/state/actions/update_menu_action.rb +21 -0
- data/lib/shoko/application/state/actions/update_message_action.rb +35 -0
- data/lib/shoko/application/state/actions/update_page_action.rb +21 -0
- data/lib/shoko/application/state/actions/update_pagination_state_action.rb +21 -0
- data/lib/shoko/application/state/actions/update_popup_menu_action.rb +27 -0
- data/lib/shoko/application/state/actions/update_reader_meta_action.rb +21 -0
- data/lib/shoko/application/state/actions/update_reader_mode_action.rb +20 -0
- data/lib/shoko/application/state/actions/update_rendered_lines_action.rb +40 -0
- data/lib/shoko/application/state/actions/update_selection_action.rb +27 -0
- data/lib/shoko/application/state/actions/update_selections_action.rb +21 -0
- data/lib/shoko/application/state/actions/update_sidebar_action.rb +34 -0
- data/lib/shoko/application/state/actions/update_ui_loading_action.rb +23 -0
- data/lib/shoko/application/ui/reader_view_model_builder.rb +74 -0
- data/lib/shoko/application/ui/view_models/reader_view_model.rb +177 -0
- data/lib/shoko/application/unified_application.rb +48 -0
- data/lib/shoko/application/use_cases/catalog_service.rb +117 -0
- data/lib/shoko/application/use_cases/commands/annotation_editor_commands.rb +105 -0
- data/lib/shoko/application/use_cases/commands/application_commands.rb +208 -0
- data/lib/shoko/application/use_cases/commands/base_command.rb +166 -0
- data/lib/shoko/application/use_cases/commands/bookmark_commands.rb +114 -0
- data/lib/shoko/application/use_cases/commands/conditional_navigation_commands.rb +57 -0
- data/lib/shoko/application/use_cases/commands/menu_commands.rb +170 -0
- data/lib/shoko/application/use_cases/commands/navigation_commands.rb +183 -0
- data/lib/shoko/application/use_cases/commands/reader_commands.rb +46 -0
- data/lib/shoko/application/use_cases/commands/sidebar_commands.rb +55 -0
- data/lib/shoko/application/use_cases/settings_service.rb +123 -0
- data/lib/shoko/core/events/annotation_events.rb +94 -0
- data/lib/shoko/core/events/base_domain_event.rb +169 -0
- data/lib/shoko/core/events/bookmark_events.rb +41 -0
- data/lib/shoko/core/events/domain_event_bus.rb +163 -0
- data/lib/shoko/core/events/progress_events.rb +108 -0
- data/lib/shoko/core/models/bookmark.rb +36 -0
- data/lib/shoko/core/models/bookmark_data.rb +10 -0
- data/lib/shoko/core/models/chapter.rb +25 -0
- data/lib/shoko/core/models/content_block.rb +44 -0
- data/lib/shoko/core/models/reader_settings.rb +20 -0
- data/lib/shoko/core/models/selection_anchor.rb +73 -0
- data/lib/shoko/core/models/toc_entry.rb +14 -0
- data/lib/shoko/core/ports/annotation_repository.rb +0 -0
- data/lib/shoko/core/ports/book_repository.rb +0 -0
- data/lib/shoko/core/ports/book_source.rb +0 -0
- data/lib/shoko/core/ports/bookmark_repository.rb +0 -0
- data/lib/shoko/core/ports/cache.rb +0 -0
- data/lib/shoko/core/ports/input_handler.rb +0 -0
- data/lib/shoko/core/ports/renderer.rb +0 -0
- data/lib/shoko/core/ports/storage.rb +0 -0
- data/lib/shoko/core/services/annotation_service.rb +102 -0
- data/lib/shoko/core/services/base_service.rb +60 -0
- data/lib/shoko/core/services/bookmark_service.rb +267 -0
- data/lib/shoko/core/services/coordinate_service.rb +265 -0
- data/lib/shoko/core/services/layout_service.rb +95 -0
- data/lib/shoko/core/services/navigation/absolute_change_applier.rb +96 -0
- data/lib/shoko/core/services/navigation/absolute_layout.rb +101 -0
- data/lib/shoko/core/services/navigation/absolute_strategy.rb +179 -0
- data/lib/shoko/core/services/navigation/context_builder.rb +52 -0
- data/lib/shoko/core/services/navigation/context_helpers.rb +63 -0
- data/lib/shoko/core/services/navigation/dynamic_change_applier.rb +50 -0
- data/lib/shoko/core/services/navigation/dynamic_strategy.rb +51 -0
- data/lib/shoko/core/services/navigation/image_offset_snapper.rb +150 -0
- data/lib/shoko/core/services/navigation/nav_context.rb +27 -0
- data/lib/shoko/core/services/navigation/state_updater.rb +29 -0
- data/lib/shoko/core/services/navigation/strategy_factory.rb +20 -0
- data/lib/shoko/core/services/navigation_service.rb +150 -0
- data/lib/shoko/core/services/page_calculator_service.rb +242 -0
- data/lib/shoko/core/services/pagination/internal/absolute_page_map_builder.rb +28 -0
- data/lib/shoko/core/services/pagination/internal/chapter_cache.rb +60 -0
- data/lib/shoko/core/services/pagination/internal/dynamic_page_map_builder.rb +157 -0
- data/lib/shoko/core/services/pagination/internal/layout_metrics_calculator.rb +73 -0
- data/lib/shoko/core/services/pagination/internal/page_hydrator.rb +145 -0
- data/lib/shoko/core/services/pagination/internal/pagination_workflow.rb +152 -0
- data/lib/shoko/core/services/pagination/page_info_calculator.rb +247 -0
- data/lib/shoko/core/services/pagination/pagination_cache_preloader.rb +173 -0
- data/lib/shoko/core/services/pagination/pagination_coordinator.rb +202 -0
- data/lib/shoko/core/services/pagination/pagination_orchestrator.rb +291 -0
- data/lib/shoko/core/services/pagination.rb +10 -0
- data/lib/shoko/core/services/progress_helper.rb +22 -0
- data/lib/shoko/core/services/selection_service.rb +126 -0
- data/lib/shoko/core/validator.rb +76 -0
- data/lib/shoko/shared/errors.rb +97 -0
- data/lib/shoko/shared/version.rb +5 -0
- data/lib/shoko/test_support/terminal_double.rb +175 -0
- data/lib/shoko/test_support/test_mode.rb +78 -0
- data/lib/shoko.rb +279 -0
- data/lib/zip.rb +732 -0
- data/zip.rb +5 -0
- metadata +370 -0
|
@@ -0,0 +1,661 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
require 'rexml/document'
|
|
5
|
+
require 'rexml/parsers/pullparser'
|
|
6
|
+
|
|
7
|
+
require_relative '../../../../core/models/content_block.rb'
|
|
8
|
+
require_relative 'html_processor'
|
|
9
|
+
require_relative '../../../output/terminal/terminal_sanitizer.rb'
|
|
10
|
+
require_relative '../../../../shared/errors.rb'
|
|
11
|
+
require_relative '../../../monitoring/logger.rb'
|
|
12
|
+
|
|
13
|
+
module Shoko
|
|
14
|
+
module Adapters::BookSources::Epub::Parsers
|
|
15
|
+
# Parses XHTML content into semantic content blocks + text segments.
#
# Usage: `XHTMLContentParser.new(html).parse` returns an Array of content
# blocks. Blank input yields an empty Array. When REXML rejects the markup,
# the parser logs the error and falls back to plain-text paragraphs.
class XHTMLContentParser
  # Tag vocabulary handed to the segment builder, block builder and traversal.
  TAG_SETS = begin
    block_types = %w[p div section article aside header footer figure figcaption main].freeze
    heading_types = %w[h1 h2 h3 h4 h5 h6].freeze
    list_types = %w[ul ol].freeze
    list_item = 'li'
    blockquote = 'blockquote'
    pre = 'pre'
    hr = 'hr'
    br = 'br'
    img = 'img'
    table = 'table'
    # Note: `br` and `img` are deliberately not part of the block-level set.
    block_level_elements = (
      block_types +
      heading_types +
      list_types +
      [
        list_item,
        blockquote,
        pre,
        hr,
        table,
      ]
    ).freeze

    {
      inline_newline: "\n",
      block_types: block_types,
      heading_types: heading_types,
      list_types: list_types,
      list_item: list_item,
      blockquote: blockquote,
      pre: pre,
      hr: hr,
      br: br,
      img: img,
      table: table,
      block_level_elements: block_level_elements,
    }.freeze
  end

  WHITESPACE_PATTERN = /\s+/
  # Entity names that XML itself defines; these must reach REXML untouched.
  XML_ENTITY_NAMES = %w[amp lt gt apos quot].freeze

  # @param html [#to_s] raw chapter markup (nil is treated as an empty string)
  def initialize(html)
    @html = html.to_s
    @segment_builder = XHTMLSegmentBuilder.new(tag_sets: TAG_SETS, whitespace_pattern: WHITESPACE_PATTERN)
    @block_builder = XHTMLBlockBuilder.new(segment_builder: @segment_builder, tag_sets: TAG_SETS)
  end

  # Parses the markup given to the constructor.
  #
  # @return [Array] content blocks; empty when the input is blank or has no root
  # @raise [Shoko::FormattingError] when the body has direct text but no block
  #   was produced (only REXML::ParseException is rescued here)
  def parse
    return [] if html_blank?

    body = parse_body
    return [] unless body

    build_blocks(body)
  rescue REXML::ParseException => e
    Adapters::Monitoring::Logger.error('Failed to parse chapter HTML', error: e.message)
    fallback_blocks
  end

  private

  # True when the source is empty or whitespace-only.
  def html_blank?
    @html.strip.empty?
  end

  # Runs the traversal over +body+ and verifies that it produced something.
  def build_blocks(body)
    blocks = XHTMLContentTraversal.new(block_builder: @block_builder, tag_sets: TAG_SETS).build(body)
    ensure_blocks_present(body, blocks)
    blocks
  end

  # Returns the element to traverse: the <body> when present, else the root.
  def parse_body
    document = parse_document(@html)
    return nil unless document

    find_body(document) || document.root
  end

  # Sanitizes +text+ and parses it into a REXML document.
  def parse_document(text)
    safe = Shoko::Adapters::Output::Terminal::TerminalSanitizer.sanitize_xml_source(text.to_s, preserve_newlines: true,
                                                                                               preserve_tabs: true)
    sanitized = sanitize_for_xml(safe)
    # Preserve whitespace-only text nodes so inline element boundaries
    # don't accidentally collapse words (e.g., <em>foo</em>\n<em>bar</em>).
    # Whitespace is normalized later, when segments are built.
    REXML::Document.new(sanitized)
  end

  # Rewrites every named entity reference (`&name;`) via #sanitize_entity so
  # that HTML-only entities do not reach the XML parser.
  def sanitize_for_xml(text)
    text.gsub(/&([A-Za-z][A-Za-z0-9]+);/) do |match|
      sanitize_entity(match)
    end
  end

  # Decides what a single `&name;` reference becomes.
  #
  # @param match [String] the full reference, including `&` and `;`
  # @return [String] the reference itself for XML built-ins and for entities
  #   the HTML decoder does not know; the decoded text otherwise
  def sanitize_entity(match)
    # The regexp match happened in the caller's frame. `$~` (and therefore
    # Regexp.last_match) is method-local, so it is nil here; take the name
    # from the matched text instead of the backreference.
    name = match[1...-1]
    return match if XML_ENTITY_NAMES.include?(name)

    decoded = Shoko::Adapters::BookSources::Epub::Parsers::HTMLProcessor.decode_entities(match)
    decoded == match ? "&#{name};" : decoded
  end

  # Looks for a direct <body> child of the root: namespace-agnostic first,
  # then the plain lower- and upper-case names.
  def find_body(document)
    root = document&.root
    return nil unless root

    elements = root.elements
    elements['*[local-name()="body"]'] ||
      elements['body'] ||
      elements['BODY']
  end

  # Guards against silently dropping content: if +body+ carries direct text
  # but the traversal produced no blocks, log a sample and raise.
  def ensure_blocks_present(body, blocks)
    text_content = body.texts.join.strip
    return if text_content.empty? || blocks.any?

    Adapters::Monitoring::Logger.error(
      'Formatting produced no blocks',
      source: 'XHTMLContentParser',
      sample: text_content.slice(0, 120)
    )
    raise Shoko::FormattingError.new('chapter', 'normalized block list was empty')
  end

  # Best-effort path for markup REXML cannot parse: convert to plain text and
  # emit one paragraph block per blank-line-separated chunk. Any failure here
  # deliberately degrades to an empty result.
  def fallback_blocks
    text = Shoko::Adapters::BookSources::Epub::Parsers::HTMLProcessor.html_to_text(@html)
    return [] if text.to_s.strip.empty?

    paragraphs = text.split(/\n{2,}/).map(&:strip).reject(&:empty?)
    paragraphs.map do |paragraph|
      Shoko::Core::Models::ContentBlock.new(
        type: :paragraph,
        segments: [@segment_builder.text_segment(paragraph)],
        metadata: {}
      )
    end
  rescue StandardError
    []
  end
end
|
|
159
|
+
|
|
160
|
+
# Traverses elements and emits block structures.
|
|
161
|
+
class XHTMLContentTraversal
|
|
162
|
+
# Traversal state for list nesting and blockquote context.
|
|
163
|
+
Context = Struct.new(:list_stack, :in_blockquote, keyword_init: true)
|
|
164
|
+
private_constant :Context
|
|
165
|
+
|
|
166
|
+
# Tracks ordered list numbering as the traversal enters list items.
|
|
167
|
+
ListContext = Struct.new(:ordered, :index, keyword_init: true) do
|
|
168
|
+
def marker
|
|
169
|
+
ordered ? "#{index}." : '•'
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def advance
|
|
173
|
+
self.index += 1 if ordered
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
private_constant :ListContext
|
|
177
|
+
|
|
178
|
+
def initialize(block_builder:, tag_sets:)
|
|
179
|
+
@block_builder = block_builder
|
|
180
|
+
@tag_sets = tag_sets
|
|
181
|
+
@blocks = []
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def build(root)
|
|
185
|
+
context = Context.new(list_stack: [], in_blockquote: false)
|
|
186
|
+
traverse_children(root, context)
|
|
187
|
+
@block_builder.compact_blocks(@blocks)
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
private
|
|
191
|
+
|
|
192
|
+
attr_reader :block_builder, :tag_sets
|
|
193
|
+
|
|
194
|
+
def traverse_children(node, context)
|
|
195
|
+
node.children.each { |child| handle_node(child, context) }
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
def handle_node(child, context)
|
|
199
|
+
if child.is_a?(REXML::Element)
|
|
200
|
+
handle_element(child, context)
|
|
201
|
+
elsif child.is_a?(REXML::Text)
|
|
202
|
+
append_text_block(child, context)
|
|
203
|
+
end
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def handle_element(element, context)
|
|
207
|
+
name = element.name.downcase
|
|
208
|
+
return if skip_element?(name)
|
|
209
|
+
|
|
210
|
+
return if append_block_result(block_builder.block_for(name, element, context))
|
|
211
|
+
return if handle_list_element(name, element, context)
|
|
212
|
+
return if handle_container_element(name, element, context)
|
|
213
|
+
|
|
214
|
+
traverse_children(element, context)
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def append_block_result(result)
|
|
218
|
+
return false unless result
|
|
219
|
+
|
|
220
|
+
if result.is_a?(Array)
|
|
221
|
+
result.each { |block| append_block(block) }
|
|
222
|
+
else
|
|
223
|
+
append_block(result)
|
|
224
|
+
end
|
|
225
|
+
true
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
def handle_list_element(name, element, context)
|
|
229
|
+
list_types = tag_sets[:list_types]
|
|
230
|
+
if list_types.include?(name)
|
|
231
|
+
traverse_list(element, context, ordered: name == 'ol')
|
|
232
|
+
return true
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
return false unless name == tag_sets[:list_item]
|
|
236
|
+
|
|
237
|
+
append_block(block_builder.list_item(element, context))
|
|
238
|
+
true
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
def handle_container_element(name, element, context)
|
|
242
|
+
block_types = tag_sets[:block_types]
|
|
243
|
+
block_level = tag_sets[:block_level_elements]
|
|
244
|
+
return false unless block_types.include?(name) || block_builder.block_via_style?(element)
|
|
245
|
+
|
|
246
|
+
if block_builder.contains_block_children?(element, block_level)
|
|
247
|
+
traverse_children(element, context)
|
|
248
|
+
else
|
|
249
|
+
append_block(block_builder.paragraph(element, context))
|
|
250
|
+
end
|
|
251
|
+
true
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
def append_text_block(text_node, context)
|
|
255
|
+
segments = block_builder.segments_from_text(text_node.value)
|
|
256
|
+
append_block(block_builder.paragraph_from_segments(segments, context)) if segments
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
def traverse_list(element, context, ordered:)
|
|
260
|
+
list_context = ListContext.new(ordered: ordered, index: ordered ? 1 : nil)
|
|
261
|
+
new_context = Context.new(list_stack: context.list_stack + [list_context],
|
|
262
|
+
in_blockquote: context.in_blockquote)
|
|
263
|
+
element.each_element { |child| handle_element(child, new_context) }
|
|
264
|
+
end
|
|
265
|
+
|
|
266
|
+
def append_block(block)
|
|
267
|
+
@blocks << block if block
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
def skip_element?(name)
|
|
271
|
+
%w[script style].include?(name)
|
|
272
|
+
end
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
# Builds content blocks and metadata from parsed elements.
|
|
276
|
+
class XHTMLBlockBuilder
|
|
277
|
+
ContentBlock = Shoko::Core::Models::ContentBlock
|
|
278
|
+
|
|
279
|
+
# Keeps the segment builder (as @segments) and the tag vocabulary that the
# block-building methods consult.
def initialize(segment_builder:, tag_sets:)
  @tag_sets = tag_sets
  @segments = segment_builder
end
|
|
283
|
+
|
|
284
|
+
# Builds the dedicated block for +name+, if it has one.
#
# Headings are tried first; otherwise the tag is matched against the
# blockquote/img/pre/hr/table/br entries of the tag vocabulary.
# Returns nil when +name+ has no dedicated block type.
def block_for(name, element, context)
  heading = heading_block(name, element, context)
  return heading if heading

  # First vocabulary key whose tag matches, checked in the same order
  # (and with the same `===` comparison) as a case/when would use.
  kind = %i[blockquote img pre hr table br].find { |key| @tag_sets[key] === name }
  case kind
  when :blockquote then quote_block(element, context)
  when :img then image_block(element, context)
  when :pre then preformatted_block(element, context)
  when :hr then separator_block(context)
  when :table then table_blocks(element, context)
  when :br then break_block
  end
end
|
|
303
|
+
|
|
304
|
+
# Builds a :list_item block for +element+.
#
# The marker comes from the innermost list context, which is then
# advanced; with an empty list stack the marker falls back to a bullet.
# The block level is the depth of the list stack.
def list_item(element, context)
  stack = context.list_stack
  current = stack.last
  inline = segments_for(element)

  bullet = '•'
  if current
    bullet = current.marker
    current.advance
  end

  depth = stack.length
  ContentBlock.new(
    type: :list_item,
    segments: inline,
    level: depth,
    metadata: metadata_with_quote(context, marker: bullet, level: depth)
  )
end
|
|
315
|
+
|
|
316
|
+
# Builds a :paragraph block from the inline content of +element+, or
# returns nil when the element yields no segments.
def paragraph(element, context)
  inline = segments_for(element)
  return if inline.empty?

  ContentBlock.new(
    type: :paragraph,
    segments: inline,
    metadata: metadata_with_quote(context)
  )
end
|
|
322
|
+
|
|
323
|
+
# Wraps already-built +segments+ in a :paragraph block. Returns nil when
# +segments+ is nil or empty.
def paragraph_from_segments(segments, context)
  missing = segments.nil? || segments.empty?
  return if missing

  ContentBlock.new(
    type: :paragraph,
    segments: segments,
    metadata: metadata_with_quote(context)
  )
end
|
|
328
|
+
|
|
329
|
+
# Builds a single text segment from +text+ and finalizes it. Returns the
# finalized segment list, or nil when finalizing leaves nothing.
def segments_from_text(text)
  finalized = @segments.finalize_segments([@segments.text_segment(text)])
  return if finalized.empty?

  finalized
end
|
|
334
|
+
|
|
335
|
+
# Returns +blocks+ without nil entries and without blocks that have no
# segments or only whitespace text. Blocks of type :break are always kept.
def compact_blocks(blocks)
  blocks.select do |block|
    next true if block&.type == :break
    next false if block.nil?

    !block.segments.empty? && !block.text.strip.empty?
  end
end
|
|
342
|
+
|
|
343
|
+
# True when the element's inline style attribute declares
# `display: block` or `display: list-item` (case-insensitive).
def block_via_style?(element)
  element.attributes['style'].to_s.match?(/display\s*:\s*(block|list-item)/i)
end
|
|
347
|
+
|
|
348
|
+
# True when any direct child element of +element+ is block-level: either
# its lower-cased name is in +block_level_elements+ or its inline style
# marks it as a block. Non-element children are ignored.
def contains_block_children?(element, block_level_elements)
  element.children.any? do |node|
    node.is_a?(REXML::Element) &&
      (block_level_elements.include?(node.name.to_s.downcase) || block_via_style?(node))
  end
end
|
|
356
|
+
|
|
357
|
+
private
|
|
358
|
+
|
|
359
|
+
# Builds a :heading block when +name+ is one of the configured heading
# tags; returns nil otherwise. The level is the number left after
# removing every 'h' from the tag name.
def heading_block(name, element, context)
  return unless @tag_sets[:heading_types].include?(name)

  depth = name.delete('h').to_i
  ContentBlock.new(
    type: :heading,
    segments: segments_for(element),
    level: depth,
    metadata: metadata_with_quote(context, level: depth)
  )
end
|
|
368
|
+
|
|
369
|
+
# Builds a :quote block from the inline content of +element+, flagged as
# quoted in its metadata. Returns nil when there are no segments.
def quote_block(element, context)
  quoted_segments = segments_for(element)
  return if quoted_segments.empty?

  ContentBlock.new(
    type: :quote,
    segments: quoted_segments,
    metadata: metadata_with_quote(context, quoted: true)
  )
end
|
|
376
|
+
|
|
377
|
+
# Builds a :code block from a preformatted element.
#
# The text is taken from the first <code> child when there is one,
# otherwise from +element+ itself. Text nested inside descendant elements
# (for example syntax-highlighting <span> wrappers) is included in
# document order, so marked-up listings keep all of their characters.
# Whitespace is preserved via the segment styles and block metadata.
#
# @param element [REXML::Element] the preformatted element
# @param context the traversal context (read for its blockquote flag)
# @return the code block, or nil when the element holds no text
def preformatted_block(element, context)
  target = code_child_for(element) || element
  text = preformatted_text(target)
  return nil if text.empty?

  metadata = metadata_with_quote(context, preserve_whitespace: true)
  segment = @segments.text_segment(text, code: true, preserve_whitespace: true)
  ContentBlock.new(type: :code, segments: [segment], metadata: metadata)
end

# Concatenates the string form of every text node under +node+, descending
# into child elements; other node kinds (comments etc.) contribute nothing.
def preformatted_text(node)
  node.children.map do |child|
    case child
    when REXML::Text then child.to_s
    when REXML::Element then preformatted_text(child)
    else ''
    end
  end.join
end
|
|
386
|
+
|
|
387
|
+
# Builds an :image block whose only segment is the finalized image
# placeholder and whose metadata records the element's src and alt
# attributes. Returns nil if finalizing leaves no segments.
def image_block(element, context)
  placeholder = @segments.image_placeholder_segment({})
  finalized = @segments.finalize_segments([placeholder])
  return if finalized.empty?

  attributes = element.attributes
  image_info = { src: attributes['src'], alt: attributes['alt'] }
  ContentBlock.new(
    type: :image,
    segments: finalized,
    metadata: metadata_with_quote(context, image: image_info)
  )
end
|
|
395
|
+
|
|
396
|
+
# Builds a :separator block whose single segment is a 40-character
# horizontal rule.
def separator_block(context)
  meta = metadata_with_quote(context)
  rule = @segments.text_segment('─' * 40)
  ContentBlock.new(type: :separator, segments: [rule], metadata: meta)
end
|
|
404
|
+
|
|
405
|
+
# Renders a table as one whitespace-preserving :table block: every <tr>
# descendant becomes a line of text and the lines are joined with the
# configured inline newline. Returns an array holding that single block,
# or an empty array when no row produces any text.
def table_blocks(element, context)
  lines = collect_descendants(element, 'tr').filter_map { |row| table_row_text(row) }
  return [] if lines.empty?

  separator = @tag_sets[:inline_newline]
  meta = metadata_with_quote(context, preserve_whitespace: true)
  body = @segments.text_segment(lines.join(separator), preserve_whitespace: true)

  [ContentBlock.new(type: :table, segments: [body], metadata: meta)]
end
|
|
421
|
+
|
|
422
|
+
# Builds a segment-less :break block flagged as a spacer.
def break_block
  ContentBlock.new(type: :break, segments: [], metadata: { spacer: true })
end
|
|
429
|
+
|
|
430
|
+
# Collects the inline segments of +element+ and finalizes them through the
# segment builder.
def segments_for(element)
  collected = @segments.collect_segments(element)
  @segments.finalize_segments(collected)
end
|
|
433
|
+
|
|
434
|
+
# Returns a copy of +base+, with `quoted: true` added when the context is
# inside a blockquote. +base+ itself is never modified.
def metadata_with_quote(context, base = {})
  context.in_blockquote ? base.merge(quoted: true) : base.dup
end
|
|
439
|
+
|
|
440
|
+
# Returns the first child element named 'code' (ASCII case-insensitive),
# or nil when there is none.
def code_child_for(element)
  element.elements.find do |candidate|
    next false unless candidate.is_a?(REXML::Element)

    candidate.name.casecmp('code').zero?
  end
end
|
|
445
|
+
|
|
446
|
+
# Renders one table row as text: the stripped text of each td/th cell,
# skipping cells that end up empty, joined with ' | '. Returns nil when no
# cell contributes any text.
def table_row_text(row)
  cell_texts = row.elements.filter_map do |cell|
    next unless table_cell?(cell)

    content = @segments.collect_segments(cell).map(&:text).join.strip
    content unless content.empty?
  end
  return if cell_texts.empty?

  cell_texts.join(' | ')
end
|
|
455
|
+
|
|
456
|
+
# True when the element's lower-cased name is 'td' or 'th'.
def table_cell?(element)
  case element.name.downcase
  when 'td', 'th' then true
  else false
  end
end
|
|
459
|
+
|
|
460
|
+
# Returns every descendant element of +element+ whose name equals +name+
# (ASCII case-insensitive), in document (pre-order) order. +element+
# itself is never included.
def collect_descendants(element, name)
  found = []
  pending = []
  element.each_element { |child| pending << child }
  pending.reverse!

  # Explicit stack; children are pushed reversed so they pop in document order.
  until pending.empty?
    node = pending.pop
    found << node if node.name.casecmp(name).zero?

    children = []
    node.each_element { |child| children << child }
    pending.concat(children.reverse)
  end

  found
end
|
|
468
|
+
end
|
|
469
|
+
|
|
470
|
+
# Collects and normalizes inline text segments.
|
|
471
|
+
class XHTMLSegmentBuilder
|
|
472
|
+
TextSegment = Shoko::Core::Models::TextSegment
|
|
473
|
+
|
|
474
|
+
STYLE_MAP = {
|
|
475
|
+
'strong' => { bold: true },
|
|
476
|
+
'b' => { bold: true },
|
|
477
|
+
'em' => { italic: true },
|
|
478
|
+
'i' => { italic: true },
|
|
479
|
+
'u' => { underline: true },
|
|
480
|
+
'code' => { code: true, preserve_whitespace: true },
|
|
481
|
+
'kbd' => { code: true, preserve_whitespace: true },
|
|
482
|
+
'samp' => { code: true, preserve_whitespace: true },
|
|
483
|
+
}.freeze
|
|
484
|
+
|
|
485
|
+
SPAN_STYLE_MATCHERS = {
|
|
486
|
+
bold: /font-weight\s*:\s*bold/i,
|
|
487
|
+
italic: /font-style\s*:\s*italic/i,
|
|
488
|
+
underline: /text-decoration\s*:\s*underline/i,
|
|
489
|
+
}.freeze
|
|
490
|
+
|
|
491
|
+
PLACEHOLDER_TEXT = '[Image]'
|
|
492
|
+
|
|
493
|
+
# @param tag_sets lookup read for the :br, :img and :inline_newline entries
# @param whitespace_pattern pattern whose matches are replaced with a single
#   space when text is whitespace-normalized
def initialize(tag_sets:, whitespace_pattern:)
  @whitespace_pattern = whitespace_pattern
  @inline_newline = tag_sets[:inline_newline]
  @img_tag = tag_sets[:img]
  @br_tag = tag_sets[:br]
end
|
|
499
|
+
|
|
500
|
+
# Gathers the segments of every child node of +element+, in order, passing
# +inherited_styles+ down to each child.
def collect_segments(element, inherited_styles = {})
  element.children.each_with_object([]) do |node, gathered|
    gathered.concat(segments_for(node, inherited_styles))
  end
end
|
|
503
|
+
|
|
504
|
+
# Builds a TextSegment carrying +styles+, with +text+ converted to a string
# and normalized according to those styles.
def text_segment(text, styles = {})
  normalized = normalize_text(text.to_s, styles)
  TextSegment.new(text: normalized, styles: styles)
end
|
|
510
|
+
|
|
511
|
+
# Builds the image placeholder segment using +inherited_styles+ plus
# `dim: true`.
def image_placeholder_segment(inherited_styles)
  dimmed = inherited_styles.merge(dim: true)
  placeholder_segment(dimmed)
end
|
|
514
|
+
|
|
515
|
+
# Builds the placeholder segment for an inline image. The styles are
# +inherited_styles+ plus `dim: true` and an :inline_image entry holding
# the element's src and (stripped) alt attributes as strings.
def inline_image_placeholder_segment(element, inherited_styles)
  attributes = element.attributes
  image_info = { src: attributes['src'].to_s, alt: attributes['alt'].to_s.strip }
  placeholder_segment(inherited_styles.merge(dim: true, inline_image: image_info))
end
|
|
523
|
+
|
|
524
|
+
# Cleans up a segment list: drops nil/empty segments, collapses doubled
# spaces at segment boundaries, then trims whitespace at both ends.
# Returns an empty array when nothing survives the first step.
def finalize_segments(segments)
  kept = compact_segments(segments)
  kept.empty? ? [] : trim_edge_whitespace(collapse_boundary_spaces(kept))
end
|
|
531
|
+
|
|
532
|
+
private
|
|
533
|
+
|
|
534
|
+
# Returns the segments for one child node: a single text segment for a
# text node (none when its normalized text is empty), the element's own
# segments for an element, and nothing for nil or any other node type.
def segments_for(child, inherited_styles)
  case child
  when REXML::Text
    text_part = text_segment(child.value, inherited_styles)
    text_part.text.to_s.empty? ? [] : [text_part]
  when REXML::Element
    segments_for_element(child, inherited_styles)
  else
    []
  end
end
|
|
546
|
+
|
|
547
|
+
# Returns the segments for an element: a line-break segment for the br
# tag, an inline image placeholder for the img tag, and otherwise the
# segments of its children with this element's styles merged over the
# inherited ones.
def segments_for_element(element, inherited_styles)
  tag = element.name.downcase

  if tag == @br_tag
    [line_break_segment(inherited_styles)]
  elsif tag == @img_tag
    [inline_image_placeholder_segment(element, inherited_styles)]
  else
    collect_segments(element, inherited_styles.merge(styles_for(tag, element)))
  end
end
|
|
555
|
+
|
|
556
|
+
# Builds a segment holding the inline newline, styled with
# +inherited_styles+ plus `break: true`.
def line_break_segment(inherited_styles)
  break_styles = inherited_styles.merge(break: true)
  text_segment(@inline_newline, break_styles)
end
|
|
559
|
+
|
|
560
|
+
# Returns the styles an element contributes: the STYLE_MAP entry for known
# tag names, inline-style-derived styles for 'span', link styles for 'a',
# and an empty hash for everything else.
def styles_for(name, element)
  STYLE_MAP.fetch(name) do
    case name
    when 'span' then span_styles(element)
    when 'a' then link_styles(element)
    else {}
    end
  end
end
|
|
567
|
+
|
|
568
|
+
# Styles for an anchor: the href under :link, followed by any styles
# derived from the element's inline style attribute.
def link_styles(element)
  href = element.attributes['href']
  { link: href, **span_styles(element) }
end
|
|
571
|
+
|
|
572
|
+
# Derives style flags from an element's inline style attribute: each key of
# SPAN_STYLE_MATCHERS whose pattern matches is set to true. Returns an
# empty hash when the attribute is missing or empty.
def span_styles(element)
  css = element.attributes['style']
  return {} if css.to_s.empty?

  SPAN_STYLE_MATCHERS
    .select { |_flag, pattern| pattern.match?(css) }
    .transform_values { true }
end
|
|
580
|
+
|
|
581
|
+
# Decodes +text+ and normalizes it according to +styles+: whitespace-
# preserving styles keep the decoded text as is, break segments go through
# break normalization, and everything else has its whitespace collapsed.
def normalize_text(text, styles)
  decoded = decode_text(text)

  if preserve_whitespace?(styles)
    decoded
  elsif styles[:break]
    normalize_break(decoded)
  else
    normalize_whitespace(decoded)
  end
end
|
|
588
|
+
|
|
589
|
+
# Decodes HTML entities in +text+, then passes the result through the
# terminal sanitizer with newlines and tabs preserved.
def decode_text(text)
  entity_free = Shoko::Adapters::BookSources::Epub::Parsers::HTMLProcessor.decode_entities(text)
  Shoko::Adapters::Output::Terminal::TerminalSanitizer.sanitize(
    entity_free,
    preserve_newlines: true,
    preserve_tabs: true
  )
end
|
|
593
|
+
|
|
594
|
+
# Truthy when +styles+ marks the text as code or as whitespace-preserving.
def preserve_whitespace?(styles)
  code_flag = styles[:code]
  return code_flag if code_flag

  styles[:preserve_whitespace]
end
|
|
597
|
+
|
|
598
|
+
# Returns the configured inline newline when +text+ equals it, otherwise
# +text+ unchanged.
def normalize_break(text)
  return @inline_newline if text == @inline_newline

  text
end
|
|
601
|
+
|
|
602
|
+
# Removes carriage returns, turns newlines into spaces, then replaces every
# match of the configured whitespace pattern with a single space.
def normalize_whitespace(text)
  without_returns = text.delete("\r")
  single_line = without_returns.tr("\n", ' ')
  single_line.gsub(@whitespace_pattern, ' ')
end
|
|
605
|
+
|
|
606
|
+
# Builds a text segment holding the placeholder label padded with one space
# on each side, carrying +styles+.
def placeholder_segment(styles)
  label = " #{PLACEHOLDER_TEXT} "
  text_segment(label, styles)
end
|
|
609
|
+
|
|
610
|
+
# Coerces +segments+ to an array and drops nil entries and segments whose
# text is empty.
def compact_segments(segments)
  Array(segments).select do |segment|
    !segment.nil? && !segment_text(segment).empty?
  end
end
|
|
613
|
+
|
|
614
|
+
# Removes doubled spaces at the seams between consecutive segments.
#
# The first segment is kept as is. Each later segment is adjusted against
# the last segment kept so far; segments that the adjustment removes or
# leaves empty are dropped.
#
# @param segments [Array] segments in display order
# @return [Array] the adjusted segments; an empty array for empty input
def collapse_boundary_spaces(segments)
  # Without this guard an empty list would seed the result with a nil entry.
  return [] if segments.empty?

  segments.drop(1).each_with_object([segments.first]) do |segment, kept|
    adjusted = adjust_leading_space(kept.last, segment)
    kept << adjusted if adjusted && !segment_text(adjusted).empty?
  end
end
|
|
625
|
+
|
|
626
|
+
# When +previous+ ends with a space and +segment+ starts with one, returns
# a copy of +segment+ without its leading spaces (nil if nothing is left).
# In every other case +segment+ is returned unchanged.
def adjust_leading_space(previous, segment)
  current_text = segment_text(segment)
  doubled = segment_text(previous).end_with?(' ') && current_text.start_with?(' ')
  return segment unless doubled

  remainder = current_text.sub(/\A +/, '')
  return if remainder.empty?

  TextSegment.new(text: remainder, styles: segment.styles)
end
|
|
636
|
+
|
|
637
|
+
# Strips leading whitespace from the first segment and trailing whitespace
# from the last, then drops any segment left with empty text. The input
# array is not modified.
def trim_edge_whitespace(segments)
  return [] if segments.empty?

  trimmed = segments.dup
  trimmed[0] = trim_segment_start(trimmed.first)
  trimmed[-1] = trim_segment_end(trimmed.last)
  trimmed.select { |segment| !segment_text(segment).empty? }
end
|
|
645
|
+
|
|
646
|
+
# Returns a new segment with the same styles and with leading whitespace
# removed from the text.
def trim_segment_start(segment)
  original = segment_text(segment)
  TextSegment.new(text: original.sub(/\A\s+/, ''), styles: segment.styles)
end
|
|
650
|
+
|
|
651
|
+
# Returns a new segment with the same styles and with trailing whitespace
# removed from the text.
def trim_segment_end(segment)
  original = segment_text(segment)
  TextSegment.new(text: original.sub(/\s+\z/, ''), styles: segment.styles)
end
|
|
655
|
+
|
|
656
|
+
# Returns the segment's text as a string (an empty string for nil text).
def segment_text(segment)
  raw = segment.text
  raw.to_s
end
|
|
659
|
+
end
|
|
660
|
+
end
|
|
661
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../../output/terminal/terminal_sanitizer.rb'
|
|
4
|
+
|
|
5
|
+
module Shoko
|
|
6
|
+
module Adapters::BookSources::Epub::Parsers
|
|
7
|
+
# Normalizes XML or XHTML text into UTF-8 and sanitizes control sequences.
|
|
8
|
+
module XmlTextNormalizer
|
|
9
|
+
module_function
|
|
10
|
+
|
|
11
|
+
# Converts raw XML/XHTML input into sanitized UTF-8 text.
#
# The source encoding is chosen as follows, after a leading UTF-8
# byte-order mark has been stripped:
#   1. a UTF-16 byte-order mark selects UTF-16LE / UTF-16BE;
#   2. otherwise the encoding named in the XML declaration is used, unless
#      it is unknown or not ASCII-compatible;
#   3. otherwise UTF-8 is assumed.
# Bytes that are invalid or unmappable are replaced with U+FFFD, a decoded
# leading U+FEFF is removed, and the result goes through the XML source
# sanitizer with newlines and tabs preserved.
#
# @param text [#to_s, nil] raw document contents
# @return the sanitizer's result for the UTF-8 text; if anything in the
#   conversion raises, the sanitizer's result for the untouched input
def normalize(text)
  bytes = String(text).dup
  bytes.force_encoding(Encoding::BINARY)
  bytes = bytes.delete_prefix("\xEF\xBB\xBF".b)

  encoding =
    if bytes.start_with?("\xFF\xFE".b)
      Encoding::UTF_16LE
    elsif bytes.start_with?("\xFE\xFF".b)
      Encoding::UTF_16BE
    else
      declared = bytes[/\A\s*<\?xml[^>]*encoding=["']([^"']+)["']/i, 1]
      found = begin
        declared ? Encoding.find(declared) : Encoding::UTF_8
      rescue StandardError
        Encoding::UTF_8
      end
      # The declaration could only be matched because the raw bytes are
      # ASCII-compatible, so a declared UTF-16/UTF-32 cannot be right.
      found.ascii_compatible? ? found : Encoding::UTF_8
    end

  normalized = bytes.force_encoding(encoding)
  normalized = normalized.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "\uFFFD")
  normalized = normalized.delete_prefix("\uFEFF")
  Shoko::Adapters::Output::Terminal::TerminalSanitizer.sanitize_xml_source(
    normalized,
    preserve_newlines: true,
    preserve_tabs: true
  )
rescue StandardError
  # Best effort: fall back to sanitizing the input exactly as received.
  Shoko::Adapters::Output::Terminal::TerminalSanitizer.sanitize_xml_source(
    text.to_s,
    preserve_newlines: true,
    preserve_tabs: true
  )
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|