jekyll-l10n 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +94 -0
- data/lib/jekyll-l10n/constants.rb +136 -0
- data/lib/jekyll-l10n/errors.rb +60 -0
- data/lib/jekyll-l10n/extraction/compendium_merger.rb +142 -0
- data/lib/jekyll-l10n/extraction/compendium_translator.rb +138 -0
- data/lib/jekyll-l10n/extraction/config_loader.rb +114 -0
- data/lib/jekyll-l10n/extraction/dom_attribute_extractor.rb +69 -0
- data/lib/jekyll-l10n/extraction/dom_text_extractor.rb +89 -0
- data/lib/jekyll-l10n/extraction/extractor.rb +153 -0
- data/lib/jekyll-l10n/extraction/html_string_extractor.rb +103 -0
- data/lib/jekyll-l10n/extraction/logger.rb +48 -0
- data/lib/jekyll-l10n/extraction/result_saver.rb +95 -0
- data/lib/jekyll-l10n/jekyll/file_sync.rb +110 -0
- data/lib/jekyll-l10n/jekyll/generator.rb +106 -0
- data/lib/jekyll-l10n/jekyll/localized_page.rb +150 -0
- data/lib/jekyll-l10n/jekyll/localized_page_mapper.rb +51 -0
- data/lib/jekyll-l10n/jekyll/page_locator.rb +59 -0
- data/lib/jekyll-l10n/jekyll/page_writer.rb +120 -0
- data/lib/jekyll-l10n/jekyll/post_write_html_reprocessor.rb +118 -0
- data/lib/jekyll-l10n/jekyll/post_write_processor.rb +71 -0
- data/lib/jekyll-l10n/jekyll/regeneration_checker.rb +123 -0
- data/lib/jekyll-l10n/jekyll/url_filter.rb +199 -0
- data/lib/jekyll-l10n/po_file/loader.rb +64 -0
- data/lib/jekyll-l10n/po_file/manager.rb +160 -0
- data/lib/jekyll-l10n/po_file/merger.rb +80 -0
- data/lib/jekyll-l10n/po_file/path_builder.rb +42 -0
- data/lib/jekyll-l10n/po_file/reader.rb +518 -0
- data/lib/jekyll-l10n/po_file/writer.rb +232 -0
- data/lib/jekyll-l10n/translation/block_text_extractor.rb +56 -0
- data/lib/jekyll-l10n/translation/html_translator.rb +229 -0
- data/lib/jekyll-l10n/translation/libre_translator.rb +226 -0
- data/lib/jekyll-l10n/translation/page_translation_loader.rb +99 -0
- data/lib/jekyll-l10n/translation/translator.rb +179 -0
- data/lib/jekyll-l10n/utils/debug_logger.rb +153 -0
- data/lib/jekyll-l10n/utils/error_handler.rb +67 -0
- data/lib/jekyll-l10n/utils/external_link_icon_preserver.rb +122 -0
- data/lib/jekyll-l10n/utils/file_operations.rb +55 -0
- data/lib/jekyll-l10n/utils/html_elements.rb +34 -0
- data/lib/jekyll-l10n/utils/html_parser.rb +52 -0
- data/lib/jekyll-l10n/utils/html_text_utils.rb +131 -0
- data/lib/jekyll-l10n/utils/logger_formatter.rb +114 -0
- data/lib/jekyll-l10n/utils/page_locales_config.rb +344 -0
- data/lib/jekyll-l10n/utils/po_entry_converter.rb +111 -0
- data/lib/jekyll-l10n/utils/site_config_accessor.rb +51 -0
- data/lib/jekyll-l10n/utils/text_normalizer.rb +47 -0
- data/lib/jekyll-l10n/utils/text_validator.rb +35 -0
- data/lib/jekyll-l10n/utils/translation_resolver.rb +115 -0
- data/lib/jekyll-l10n/utils/url_path_builder.rb +65 -0
- data/lib/jekyll-l10n/utils/url_transformer.rb +141 -0
- data/lib/jekyll-l10n/utils/xpath_reference_generator.rb +45 -0
- data/lib/jekyll-l10n/version.rb +10 -0
- data/lib/jekyll-l10n.rb +268 -0
- metadata +200 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../utils/page_locales_config"
|
|
4
|
+
|
|
5
|
+
module Jekyll
|
|
6
|
+
module L10n
|
|
7
|
+
# Loads and validates extraction configuration for files during build.
|
|
8
|
+
#
|
|
9
|
+
# ExtractionConfigLoader finds extraction configuration for generated HTML
|
|
10
|
+
# files by matching them against Jekyll site pages. It validates whether
|
|
11
|
+
# extraction is enabled and configured for a file, loads page-specific
|
|
12
|
+
# settings, and identifies files to skip (localized page variants).
|
|
13
|
+
#
|
|
14
|
+
# Key responsibilities:
|
|
15
|
+
# * Match generated HTML files to Jekyll site pages
|
|
16
|
+
# * Validate that extraction is enabled for a file
|
|
17
|
+
# * Load page-specific extraction configuration
|
|
18
|
+
# * Identify localized page variants to skip
|
|
19
|
+
# * Extract CSS selectors for element exclusion
|
|
20
|
+
#
|
|
21
|
+
# @example
|
|
22
|
+
# loader = ExtractionConfigLoader.new(site, '_site')
|
|
23
|
+
# if loader.valid_for_extraction?(file_path)
|
|
24
|
+
# config = loader.load_page_config(file_path)
|
|
25
|
+
# end
|
|
26
|
+
class ExtractionConfigLoader
  # Build a loader bound to one site and its build output directory.
  #
  # @param site [Jekyll::Site] Site whose pages are searched for configuration
  # @param dest [String] Build output directory; removed from file paths when
  #   checking for locale subdirectories
  def initialize(site, dest)
    @dest = dest
    @site = site
  end

  # Tell whether strings should be extracted from a generated file.
  #
  # The file must match a page that opted in via `with_locales`, and that
  # page's configuration must report both `enabled?` and `extract_on_build?`.
  #
  # @param file_path [String] Path of the generated file
  # @return [Boolean] False when no page matches; otherwise the combined
  #   result of the two configuration checks
  def valid_for_extraction?(file_path)
    if (front_matter = find_page_config_for_file(file_path))
      locales_config = PageLocalesConfig.new(front_matter)
      locales_config.enabled? && locales_config.extract_on_build?
    else
      false
    end
  end

  # Wrap the matching page's front matter in a PageLocalesConfig.
  #
  # NOTE(review): when no page matches, nil is handed to PageLocalesConfig.new;
  # how that is handled is not visible here, so call valid_for_extraction?
  # first unless a match is guaranteed.
  #
  # @param file_path [String] Path of the generated file
  # @return [PageLocalesConfig] Configuration built from the page front matter
  def load_page_config(file_path)
    PageLocalesConfig.new(find_page_config_for_file(file_path))
  end

  # Locate the front matter of the page that produced a generated file.
  #
  # Only pages whose `with_locales` value is exactly `true` are considered.
  # A page matches when the file path ends with the page's destination path
  # (a trailing slash is read as "/index.html"). The first match wins.
  #
  # @param file_path [String] Path of the generated file
  # @return [Hash, nil] Front matter of the matching page, or nil
  def find_page_config_for_file(file_path)
    owner = @site.pages.find do |candidate|
      next false unless candidate.data["with_locales"] == true

      target = candidate.output_ext ? candidate.destination("") : candidate.destination("/")
      file_path.end_with?(target.sub(%r!/$!, "/index.html"))
    end

    owner && owner.data
  end

  # Tell whether a generated file lives in a locale subdirectory.
  #
  # Locale codes are gathered from `with_locales_data.locales` of every site
  # page. The file is skipped when its path, with the destination directory
  # removed, starts with "/<locale>/".
  #
  # @param file_path [String] Path of the generated file
  # @return [Boolean] True if the file sits under any configured locale
  def skip_localized_page?(file_path)
    site_relative = file_path.sub(@dest, "")

    locale_codes = @site.pages
      .map { |page| page.data.dig("with_locales_data", "locales") || [] }
      .flatten
      .uniq

    locale_codes.any? { |code| site_relative.start_with?("/#{code}/") }
  end

  # Read the CSS selectors of elements that must not be extracted.
  #
  # @param config [PageLocalesConfig] Configuration object exposing the page
  #   front matter through `#data`
  # @return [Array<String>] Selectors configured under
  #   `with_locales_data.extraction.exclude_selectors`, or the built-in
  #   defaults when none are configured
  def extract_exclude_selectors(config)
    configured = config.data.dig("with_locales_data", "extraction", "exclude_selectors")
    return configured if configured

    ["script", "style", "code.language-plaintext", "pre code"]
  end
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../utils/xpath_reference_generator"
|
|
4
|
+
require_relative "../utils/text_validator"
|
|
5
|
+
|
|
6
|
+
module Jekyll
|
|
7
|
+
module L10n
|
|
8
|
+
# Extracts HTML attribute values from elements for translation.
|
|
9
|
+
#
|
|
10
|
+
# DomAttributeExtractor identifies and extracts values from configurable HTML
|
|
11
|
+
# attributes (title, alt, aria-label, placeholder, aria-description) on DOM
|
|
12
|
+
# elements. It validates extracted values and generates file location references
|
|
13
|
+
# that include the attribute name for precise debugging and reference.
|
|
14
|
+
#
|
|
15
|
+
# Key responsibilities:
|
|
16
|
+
# * Extract attribute values from DOM elements
|
|
17
|
+
# * Filter extractable attributes by whitelist
|
|
18
|
+
# * Validate attribute values (minimum length, non-numeric)
|
|
19
|
+
# * Generate attribute-specific file location references
|
|
20
|
+
# * Return entries ready for PO file format
|
|
21
|
+
#
|
|
22
|
+
# @example
|
|
23
|
+
# entries = DomAttributeExtractor.extract(node, 'docs/index.html', '_site',
|
|
24
|
+
# ['title', 'alt', 'aria-label'])
|
|
25
|
+
# # Returns array of extraction entries for all valid attribute values
|
|
26
|
+
module DomAttributeExtractor
  extend self

  # Build one extraction entry per translatable attribute text on a node.
  #
  # Non-element nodes produce no entries. Each entry's reference is generated
  # with the attribute name so the location identifies the attribute.
  #
  # @param node [Nokogiri::XML::Node] DOM node to inspect
  # @param file_path [String] Source file path, passed to the reference generator
  # @param dest [String] Destination directory, passed to the reference generator
  # @param translatable_attrs [Array<String>] Attribute names to look up
  # @return [Array<Hash>] Entries with :msgid (attribute text), :msgstr (empty
  #   string) and :reference (location including the attribute name)
  def extract(node, file_path, dest, translatable_attrs)
    return [] unless node.element?

    extract_node_attributes(node, translatable_attrs).map do |text, attribute|
      location = XPathReferenceGenerator.generate(node, file_path, dest, attribute)
      { :msgid => text, :msgstr => "", :reference => location }
    end
  end

  # Collect the valid, stripped attribute texts of a node.
  #
  # Missing and empty attributes are ignored; remaining values are stripped
  # and kept only if TextValidator accepts them. The result is keyed by text,
  # so when two attributes carry the same text only one pair survives and it
  # names the attribute listed later in translatable_attrs.
  #
  # @param node [#[]] Node supporting attribute lookup by name
  # @param translatable_attrs [Array<String>] Attribute names to look up
  # @return [Hash{String => String}] Stripped text mapped to attribute name
  def extract_node_attributes(node, translatable_attrs)
    translatable_attrs.each_with_object({}) do |attribute, found|
      raw = node[attribute]
      next if raw.nil? || raw.empty?

      text = raw.strip
      found[text] = attribute if TextValidator.valid?(text)
    end
  end
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../utils/text_normalizer"
|
|
4
|
+
require_relative "../utils/xpath_reference_generator"
|
|
5
|
+
require_relative "../utils/html_text_utils"
|
|
6
|
+
require_relative "../utils/html_elements"
|
|
7
|
+
require_relative "../utils/text_validator"
|
|
8
|
+
|
|
9
|
+
module Jekyll
|
|
10
|
+
module L10n
|
|
11
|
+
# Extracts text content from HTML elements for translation.
|
|
12
|
+
#
|
|
13
|
+
# DomTextExtractor identifies content-bearing HTML elements (paragraphs, headings,
|
|
14
|
+
# list items, etc.) and extracts their text content while preserving inline HTML
|
|
15
|
+
# tags. It validates extracted text and generates file location references for
|
|
16
|
+
# debugging. Text is extracted from elements that contain text nodes or inline
|
|
17
|
+
# elements, but not from elements containing only block-level children.
|
|
18
|
+
#
|
|
19
|
+
# Key responsibilities:
|
|
20
|
+
# * Identify extractable content elements (p, h1-h6, li, blockquote, etc.)
|
|
21
|
+
# * Extract text while preserving inline HTML structure
|
|
22
|
+
# * Skip elements containing only block-level children
|
|
23
|
+
# * Validate extracted text (minimum length, non-numeric)
|
|
24
|
+
# * Generate file location references for extracted strings
|
|
25
|
+
#
|
|
26
|
+
# @example
|
|
27
|
+
# entry = DomTextExtractor.extract(node, 'docs/index.html', '_site')
|
|
28
|
+
# # Returns hash with :msgid, :msgstr, :reference if valid text found
|
|
29
|
+
module DomTextExtractor
  extend self

  # Build an extraction entry for the text of a content element.
  #
  # @param node [Nokogiri::XML::Node] DOM node to inspect
  # @param file_path [String] Source file path, passed to the reference generator
  # @param dest [String] Destination directory, passed to the reference generator
  # @return [Hash, nil] Entry with :msgid, :msgstr (empty string) and
  #   :reference, or nil when the node is not extractable or yields no valid text
  def extract(node, file_path, dest)
    return unless extractable?(node)

    msgid = extract_block_text(node)
    return if msgid.nil?

    {
      :msgid     => msgid,
      :msgstr    => "",
      :reference => XPathReferenceGenerator.generate(node, file_path, dest),
    }
  end

  # Tell whether a node is an element listed in HtmlTextUtils::CONTENT_ELEMENTS.
  #
  # @param node [Nokogiri::XML::Node] DOM node to inspect
  # @return [Boolean]
  def extractable?(node)
    node.element? && HtmlTextUtils::CONTENT_ELEMENTS.include?(node.name)
  end

  # Extract the text of a node (inline tags included) if it is worth translating.
  #
  # @param node [Nokogiri::XML::Node] Content element
  # @return [String, nil] Extracted text, or nil when the node holds only block
  #   elements or the text is rejected by TextValidator
  def extract_block_text(node)
    return if only_contains_block_elements?(node)

    candidate = HtmlTextUtils.extract_with_inline_tags(node)
    candidate if TextValidator.valid?(candidate)
  end

  # Tell whether a node is a pure container of block-level elements.
  #
  # Any non-blank text child or any non-block element child makes the answer
  # false; otherwise the node must have at least one block element child.
  #
  # @param node [Nokogiri::XML::Node] Element to inspect
  # @return [Boolean]
  def only_contains_block_elements?(node)
    has_own_content = node.children.any? do |child|
      non_empty_text?(child) || non_block_element?(child)
    end
    return false if has_own_content

    block_element_children?(node)
  end

  private

  # True for a text node with content other than whitespace.
  def non_empty_text?(child)
    child.text? && !child.content.strip.empty?
  end

  # True for an element that is not listed in HtmlTextUtils::ALL_BLOCK_ELEMENTS.
  def non_block_element?(child)
    child.element? && !HtmlTextUtils::ALL_BLOCK_ELEMENTS.include?(child.name)
  end

  # True when at least one child is an element listed in
  # HtmlTextUtils::ALL_BLOCK_ELEMENTS.
  def block_element_children?(node)
    node.children.any? do |child|
      child.element? && HtmlTextUtils::ALL_BLOCK_ELEMENTS.include?(child.name)
    end
  end
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "config_loader"
|
|
4
|
+
require_relative "result_saver"
|
|
5
|
+
require_relative "logger"
|
|
6
|
+
require_relative "html_string_extractor"
|
|
7
|
+
require_relative "../utils/file_operations"
|
|
8
|
+
require_relative "../utils/site_config_accessor"
|
|
9
|
+
|
|
10
|
+
module Jekyll
|
|
11
|
+
module L10n
|
|
12
|
+
# String Extraction Orchestrator - Finds translatable strings in generated HTML
|
|
13
|
+
#
|
|
14
|
+
# The Extractor is the main entry point for the string extraction workflow. It scans all
|
|
15
|
+
# generated HTML files after Jekyll's build, identifies translatable content (text nodes
|
|
16
|
+
# and configurable HTML attributes), and creates or updates GNU Gettext PO files with
|
|
17
|
+
# the extracted strings.
|
|
18
|
+
#
|
|
19
|
+
# The extraction workflow:
|
|
20
|
+
# 1. Scans all HTML files in Jekyll output directory (_site/)
|
|
21
|
+
# 2. For each HTML file, extracts translatable text and attributes
|
|
22
|
+
# 3. Normalizes text for consistent matching across builds
|
|
23
|
+
# 4. Creates or updates page-specific PO files in _locales/ directory
|
|
24
|
+
# 5. Optionally applies automatic translations via LibreTranslate API
|
|
25
|
+
#
|
|
26
|
+
# Key responsibilities:
|
|
27
|
+
# - Load and validate extraction configuration from pages
|
|
28
|
+
# - Extract text and attributes from HTML with file location references
|
|
29
|
+
# - Create and update PO files with extracted strings
|
|
30
|
+
# - Log extraction statistics and progress
|
|
31
|
+
# - Coordinate with LibreTranslate for automatic translation
|
|
32
|
+
#
|
|
33
|
+
# @example Usage (typically invoked by PostWriteProcessor)
|
|
34
|
+
# extractor = Extractor.new(site)
|
|
35
|
+
# result = extractor.extract_site
|
|
36
|
+
# # => { files_processed: 42, strings_extracted: 237, po_files_created: 3 }
|
|
37
|
+
#
|
|
38
|
+
# @see Jekyll::L10n::ExtractionResultSaver for PO file creation and updates
|
|
39
|
+
# @see Jekyll::L10n::HtmlStringExtractor for text and attribute extraction
|
|
40
|
+
#
|
|
41
|
+
class Extractor
  # @!attribute [r] site
  #   The Jekyll site object with generated pages
  #   @return [Jekyll::Site]
  attr_reader :site

  # Initialize the string extractor.
  #
  # Reads the source and destination directories through SiteConfigAccessor
  # and creates the configuration loader and result saver used per file.
  #
  # @param site [Jekyll::Site] The Jekyll site object
  def initialize(site)
    @site = site
    @source = SiteConfigAccessor.source(@site)
    @dest = SiteConfigAccessor.dest(@site)
    @config_loader = ExtractionConfigLoader.new(@site, @dest)
    @result_saver = ExtractionResultSaver.new(@site)
  end

  # Extract all translatable strings from the generated site.
  #
  # Processes every HTML file under the destination directory, triggers
  # compendium translation when a page enables LibreTranslate, and logs a
  # summary including the elapsed time.
  #
  # @return [Hash<Symbol, Integer>] Totals for :files_processed,
  #   :strings_extracted and :po_files_created
  def extract_site
    Jekyll.logger.info "Localization", "Extracting translatable strings..."
    # A monotonic clock keeps the reported duration correct even if the
    # system clock is adjusted while the build is running.
    started_at = monotonic_seconds
    stats = process_all_html_files
    translate_all_compendia
    ExtractionLogger.log_summary(stats, monotonic_seconds - started_at)
    stats
  end

  # Run extraction over every HTML file below the destination directory.
  #
  # Files located in locale subdirectories are skipped; the statistics of all
  # other files are summed.
  #
  # @return [Hash<Symbol, Integer>] Summed statistics
  def process_all_html_files
    totals = default_stats

    Dir.glob(File.join(@dest, "**", "*.html")).each do |file_path|
      next if @config_loader.skip_localized_page?(file_path)

      file_stats = process_file(file_path)
      totals.each_key { |key| totals[key] += file_stats[key] }
    end

    totals
  end

  # Translate compendia using the first page configuration that enables
  # LibreTranslate; does nothing when no page enables it.
  #
  # @return [void]
  def translate_all_compendia
    config = find_libretranslate_config
    return unless config

    @result_saver.translate_compendia(config)
  end

  # Extract one generated file and save its entries.
  #
  # Any StandardError raised along the way is logged with the file path and
  # turned into zeroed statistics so the remaining files are still processed.
  #
  # @param file_path [String] Path of the generated HTML file
  # @return [Hash<Symbol, Integer>] Statistics returned by the result saver,
  #   or zeroed statistics when the file is not eligible, yields no entries,
  #   or fails
  def process_file(file_path)
    return default_stats unless @config_loader.valid_for_extraction?(file_path)

    config = @config_loader.load_page_config(file_path)
    entries = extract_strings_from_file(file_path, config)
    return default_stats if entries.empty?

    page_path = construct_page_path(file_path)
    @result_saver.save_results(config, entries, page_path)
  rescue StandardError => e
    ExtractionLogger.log_error(file_path, e)
    default_stats
  end

  # @return [Hash<Symbol, Integer>] A fresh statistics hash with all counters at zero
  def default_stats
    { :files_processed => 0, :strings_extracted => 0, :po_files_created => 0 }
  end

  # Read a generated file and extract its translatable entries.
  #
  # @param file_path [String] Path of the generated HTML file
  # @param config [PageLocalesConfig] Configuration of the matching page
  # @return [Array<Hash>] Extracted entries; empty when the file does not exist
  def extract_strings_from_file(file_path, config)
    return [] unless File.exist?(file_path)

    html = FileOperations.read_utf8(file_path)
    exclude_selectors = @config_loader.extract_exclude_selectors(config)

    extractor = HtmlStringExtractor.new(config.translatable_attributes, exclude_selectors)
    extractor.extract(html, @dest, file_path)
  end

  # Find the configuration of the first `with_locales` page that enables
  # LibreTranslate.
  #
  # @return [PageLocalesConfig, nil] The configuration, or nil when the site
  #   exposes no pages or no page enables LibreTranslate
  def find_libretranslate_config
    return nil unless @site.respond_to?(:pages)

    @site.pages.each do |page|
      next unless page.data["with_locales"] == true

      config = @config_loader.load_page_config(page.destination(""))
      return config if config.libretranslate_enabled?
    end

    nil
  end

  private

  # Path of the file relative to the destination directory.
  def construct_page_path(file_path)
    file_path.sub("#{@dest}/", "")
  end

  # Seconds on the monotonic clock; only differences between two readings
  # are meaningful.
  def monotonic_seconds
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "dom_text_extractor"
|
|
4
|
+
require_relative "dom_attribute_extractor"
|
|
5
|
+
require_relative "logger"
|
|
6
|
+
|
|
7
|
+
module Jekyll
|
|
8
|
+
module L10n
|
|
9
|
+
# Extracts translatable strings from HTML documents for localization.
|
|
10
|
+
#
|
|
11
|
+
# HtmlStringExtractor walks the DOM tree of parsed HTML and extracts text content
|
|
12
|
+
# from content elements and values from configurable HTML attributes. It deduplicates
|
|
13
|
+
# entries by msgid and generates file location references for each extraction to aid
|
|
14
|
+
# in debugging and tracking. Entries are excluded based on CSS selectors.
|
|
15
|
+
#
|
|
16
|
+
# Key responsibilities:
|
|
17
|
+
# * Parse HTML into DOM tree
|
|
18
|
+
# * Walk DOM recursively to find translatable content
|
|
19
|
+
# * Extract text from content elements (p, h1-h6, li, etc.)
|
|
20
|
+
# * Extract attribute values (title, alt, aria-label, etc.)
|
|
21
|
+
# * Generate file location references for each extracted string
|
|
22
|
+
# * Skip elements matching exclude selectors
|
|
23
|
+
# * Deduplicate entries by msgid
|
|
24
|
+
#
|
|
25
|
+
# @example
|
|
26
|
+
# extractor = HtmlStringExtractor.new(['title', 'alt'], ['script', 'style'])
|
|
27
|
+
# entries = extractor.extract(html_content, '_site', 'docs/index.html')
|
|
28
|
+
# # Returns array of hash entries with :msgid, :msgstr, :reference keys
|
|
29
|
+
class HtmlStringExtractor
  attr_reader :translatable_attrs, :exclude_selectors

  # Initialize a new HtmlStringExtractor.
  #
  # @param translatable_attrs [Array<String>] HTML attributes to extract
  #   (e.g., ['title', 'alt', 'aria-label', 'placeholder', 'aria-description'])
  # @param exclude_selectors [Array<String>] CSS selectors for elements to skip
  #   during extraction (e.g., ['script', 'style', '.no-translate'])
  def initialize(translatable_attrs, exclude_selectors)
    @translatable_attrs = translatable_attrs
    @exclude_selectors = exclude_selectors
  end

  # Extract translatable strings from HTML.
  #
  # Parses the HTML, walks the tree from the document root and collects text
  # and attribute entries. An excluded element is skipped together with its
  # whole subtree. Entries are deduplicated by :msgid, keeping the first
  # occurrence.
  #
  # @param html [String] HTML content to extract from
  # @param dest [String] Destination directory path (used for references)
  # @param file_path [String] Path to source file (used for references)
  # @return [Array<Hash>] Entries with :msgid, :msgstr and :reference keys
  def extract(html, dest, file_path)
    entries = []

    doc = Nokogiri::HTML(html)
    walk_dom(doc.root, file_path, entries, dest)

    entries.uniq { |e| e[:msgid] }
  end

  private

  # Visit a node and, unless it is excluded, its descendants (depth first).
  def walk_dom(node, file_path, entries, dest)
    return if node.nil? || should_exclude_element?(node)

    process_text_content(node, file_path, entries, dest)
    process_attributes(node, file_path, entries, dest)
    process_children(node, file_path, entries, dest)
  end

  # Append the node's text entry, if DomTextExtractor produces one.
  def process_text_content(node, file_path, entries, dest)
    entry = DomTextExtractor.extract(node, file_path, dest)
    entries << entry if entry
  end

  # Append the entries for the node's translatable attributes.
  def process_attributes(node, file_path, entries, dest)
    attr_entries = DomAttributeExtractor.extract(node, file_path, dest, @translatable_attrs)
    entries.concat(attr_entries)
  end

  # Recurse into every child of the node.
  def process_children(node, file_path, entries, dest)
    node.children.each { |child| walk_dom(child, file_path, entries, dest) }
  end

  # Tell whether an element matches any of the exclude selectors.
  #
  # Each selector is evaluated on its own, so a selector that fails does not
  # prevent the remaining selectors from being checked.
  #
  # @param node [Nokogiri::XML::Node] Node to test
  # @return [Boolean] True if the node is an element matching a selector
  def should_exclude_element?(node)
    return false unless node.element?

    @exclude_selectors.any? { |selector| selector_matches?(node, selector) }
  end

  # Match one selector against a node, treating failures as "no match".
  #
  # A selector with a syntax error can never match, so it is remembered after
  # the first failure: it is reported once and skipped afterwards. Other
  # errors may depend on the node, so they are logged each time.
  def selector_matches?(node, selector)
    return false if rejected_selectors.include?(selector)

    node.matches?(selector)
  rescue Nokogiri::CSS::SyntaxError => e
    rejected_selectors << selector
    Jekyll.logger.warn "Localization", "CSS selector syntax error: #{e.message}"
    false
  rescue StandardError => e
    Jekyll.logger.warn "Localization", "Selector matching error (continuing): #{e.message}"
    false
  end

  # Selectors already reported as syntactically invalid by this instance.
  def rejected_selectors
    @rejected_selectors ||= []
  end
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Jekyll
|
|
4
|
+
module L10n
|
|
5
|
+
# Logs extraction process errors and summary statistics.
|
|
6
|
+
#
|
|
7
|
+
# ExtractionLogger provides centralized logging for the extraction pipeline,
|
|
8
|
+
# recording errors during extraction and final summary statistics including
|
|
9
|
+
# file count, string count, PO file creation count, and elapsed time.
|
|
10
|
+
#
|
|
11
|
+
# Key responsibilities:
|
|
12
|
+
# * Log extraction errors with file context
|
|
13
|
+
# * Log extraction summary statistics
|
|
14
|
+
# * Format statistics output for user visibility
|
|
15
|
+
#
|
|
16
|
+
# @see Jekyll::L10n::Extractor for extraction workflow context
|
|
17
|
+
# @see Jekyll::L10n::CompendiumTranslator for automated translation logging
|
|
18
|
+
class ExtractionLogger
  class << self
    # Report a failure that happened while extracting from one file.
    #
    # @param file_path [String] Path to file where error occurred
    # @param error [StandardError] The error that occurred
    # @return [void]
    def log_error(file_path, error)
      Jekyll.logger.error("Localization", "Error extracting from #{file_path}: #{error.message}")
    end

    # Report the totals of a finished extraction run, one info line per figure.
    #
    # @param stats [Hash] Statistics hash with keys:
    #   - :files_processed [Integer]
    #   - :strings_extracted [Integer]
    #   - :po_files_created [Integer]
    # @param elapsed [Float] Time elapsed in seconds (logged rounded to 2 decimals)
    # @return [void]
    def log_summary(stats, elapsed)
      emit = ->(line) { Jekyll.logger.info("Localization", line) }

      emit.call("Extraction complete:")
      emit.call(" Files processed: #{stats[:files_processed]}")
      emit.call(" Strings extracted: #{stats[:strings_extracted]}")
      emit.call(" PO files created/updated: #{stats[:po_files_created]}")
      emit.call(" Time: #{elapsed.round(2)}s")
    end
  end
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../po_file/manager"
|
|
4
|
+
require_relative "../utils/page_locales_config"
|
|
5
|
+
require_relative "../utils/site_config_accessor"
|
|
6
|
+
require_relative "../utils/logger_formatter"
|
|
7
|
+
require_relative "compendium_translator"
|
|
8
|
+
require_relative "compendium_merger"
|
|
9
|
+
|
|
10
|
+
module Jekyll
|
|
11
|
+
module L10n
|
|
12
|
+
# Saves extraction results to PO files and optionally translates compendia.
|
|
13
|
+
#
|
|
14
|
+
# ExtractionResultSaver orchestrates the post-extraction process: writing
|
|
15
|
+
# page-specific PO files for all configured locales, merging with existing
|
|
16
|
+
# translations, optionally updating compendia, and triggering automatic
|
|
17
|
+
# translation via LibreTranslate if enabled.
|
|
18
|
+
#
|
|
19
|
+
# Key responsibilities:
|
|
20
|
+
# * Save page-specific PO files for each locale
|
|
21
|
+
# * Merge new entries with existing translations
|
|
22
|
+
# * Update compendium files from page-specific extractions
|
|
23
|
+
# * Trigger automatic translation of compendia
|
|
24
|
+
# * Report extraction statistics
|
|
25
|
+
#
|
|
26
|
+
# @example
|
|
27
|
+
# saver = ExtractionResultSaver.new(site)
|
|
28
|
+
# stats = saver.save_results(config, entries, 'docs/index.html')
|
|
29
|
+
# saver.translate_compendia(config) if config.libretranslate_enabled?
|
|
30
|
+
class ExtractionResultSaver
  # Initialize a new ExtractionResultSaver.
  #
  # Besides keeping the site, builds a PageLocalesConfig from the locales data
  # that SiteConfigAccessor extracts from the site.
  #
  # @param site [Jekyll::Site] Jekyll site object
  def initialize(site)
    @site = site
    locales_data = SiteConfigAccessor.extract_locales_data(@site)
    @site_config = PageLocalesConfig.new({ "with_locales_data" => locales_data })
  end

  # Write the extracted entries of one page to its PO files.
  #
  # One PO file is saved per configured locale through a PoFileManager rooted
  # at the configured locales directory; compendia are merged afterwards when
  # the configuration asks for it.
  #
  # @param config [PageLocalesConfig] Localization configuration
  # @param entries [Array<Hash>] Extracted entries with :msgid, :msgstr, :reference
  # @param page_path [String] Page path for file organization (e.g., 'docs/index.html')
  # @return [Hash] Statistics hash with keys:
  #   - :files_processed [Integer] Always 1
  #   - :strings_extracted [Integer] Number of entries given
  #   - :po_files_created [Integer] Number of locales whose save reported success
  def save_results(config, entries, page_path)
    LoggerFormatter.debug_if_enabled("ExtractionResultSaver", "Processing page: #{page_path}")

    manager = PoFileManager.new(@site, config.locales_dir)
    written = save_po_files(manager, config, entries, page_path)

    { :files_processed => 1, :strings_extracted => entries.length, :po_files_created => written }
  end

  # Hand the compendia over to CompendiumTranslator for translation.
  #
  # @param config [PageLocalesConfig] Localization configuration with LibreTranslate settings
  # @return [void]
  def translate_compendia(config)
    translator = CompendiumTranslator.new(@site)
    translator.translate_compendia(config)
  end

  private

  # Save the entries for every configured locale, then merge compendia if the
  # configuration enables it.
  #
  # @return [Integer] Number of locales for which save_po_file returned a truthy value
  def save_po_files(po_manager, config, entries, page_path)
    written = config.locales.count do |locale|
      po_manager.save_po_file(locale, entries, :page_path => page_path)
    end

    if config.update_compendium?
      CompendiumMerger.new(@site).merge_compendia(po_manager, config)
    end

    written
  end
end
|
|
94
|
+
end
|
|
95
|
+
end
|