kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'document'
|
|
4
|
+
require_relative 'location'
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Documents
|
|
8
|
+
# AsciiDoc document implementation.
|
|
9
|
+
#
|
|
10
|
+
# Handles AsciiDoc files with AST parsing for structured navigation.
|
|
11
|
+
#
|
|
12
|
+
# @example Creating an asciidoc document
|
|
13
|
+
# doc = AsciidocDocument.new("= Title\n\nParagraph text")
|
|
14
|
+
# doc.text_nodes.each { |node| puts node.text }
|
|
15
|
+
class AsciidocDocument < Document
|
|
16
|
+
require 'asciidoctor' if ENV['KOTOSHU_REQUIRE_ASCIIDOC']
|
|
17
|
+
|
|
18
|
+
# Create a new asciidoc document.
|
|
19
|
+
#
|
|
20
|
+
# @param content [String] The document content
|
|
21
|
+
# @param format [Symbol] Document format (must be :asciidoc)
|
|
22
|
+
# @param language_code [String] Language code
|
|
23
|
+
def initialize(content, format: :asciidoc, language_code: 'en')
  # This class only models AsciiDoc input; any other format is a caller error.
  if format != :asciidoc
    raise ArgumentError, "Format must be :asciidoc"
  end

  super(content, format: format, language_code: language_code)

  # Parsing is deferred until the AST is first needed (see #parse).
  @ast = nil
  @parsed = false
end
|
|
30
|
+
|
|
31
|
+
# Parse the asciidoc document into an AST.
|
|
32
|
+
#
|
|
33
|
+
# @return [Array<Asciidoctor::AbstractBlock>] The parsed AST
|
|
34
|
+
def parse
  # Memoized: the source is parsed at most once per instance.
  return @ast if @parsed

  # Asciidoctor is an optional dependency, so it is loaded lazily here.
  begin
    require 'asciidoctor'
  rescue LoadError
    raise "Asciidoctor gem not available. Add 'asciidoctor' to Gemfile"
  end

  # Asciidoctor.load parses by default. Passing `parse: false` returns an
  # unparsed document whose #blocks is empty, which would leave the AST
  # (and therefore every text node) empty, so the option is not passed.
  doc = Asciidoctor.load(content, header_footer: false)
  @ast = doc.blocks
  @parsed = true

  @ast
end
|
|
50
|
+
|
|
51
|
+
# Get all text nodes for spell checking.
|
|
52
|
+
#
|
|
53
|
+
# Extracts text from the AST, skipping code blocks and source listings.
|
|
54
|
+
#
|
|
55
|
+
# @return [Array<TextNode>] Text nodes in the document
|
|
56
|
+
def text_nodes
  # Ensure the AST exists, then collect text nodes from its top-level blocks.
  parse unless @parsed
  extract_from_ast(@ast)
end
|
|
59
|
+
|
|
60
|
+
# Get node at a specific path in the AST.
|
|
61
|
+
#
|
|
62
|
+
# @param path [Array] Node path (e.g., [:section, 0, :paragraph, 2])
|
|
63
|
+
# @return [Object, nil] The node or nil
|
|
64
|
+
def get_node(path)
  # The AST is built lazily; make sure it is available before walking it.
  @parsed || parse

  tree = @ast
  navigate_ast(tree, path)
end
|
|
69
|
+
|
|
70
|
+
# Get context around a location.
|
|
71
|
+
#
|
|
72
|
+
# For asciidoc, navigates the AST to find surrounding context.
|
|
73
|
+
#
|
|
74
|
+
# @param location [Location] The error location
|
|
75
|
+
# @param window [Integer] Number of sibling elements before/after
|
|
76
|
+
# @return [Models::Context] Context object
|
|
77
|
+
def context_for(location, window: 2)
  # Line/column locations have no AST position; use the line-based fallback.
  return plain_text_context(location, window: 5) if location.line_column?

  parse unless @parsed

  empty_context = lambda do
    Models::Context.new(before: "", current: "", after: "", location: location, window: window)
  end

  path = Array(location.node_path)

  # A trailing property accessor (e.g. [:section, 0, :title]) addresses text
  # inside the node at the preceding index; the context is that node's siblings.
  path = path[0...-1] if path.length > 1 && !path.last.is_a?(Integer) && path[-2].is_a?(Integer)

  current_idx = nil
  current_type = nil
  if path.last.is_a?(Integer)
    # Paths are built as [type, index, type, index, ...], so the last element
    # is the sibling index and the element before it (if any) is its type tag.
    current_idx = path.last
    type_tagged = path.length >= 2 && !path[-2].is_a?(Integer)
    parent_path = type_tagged ? path[0...-2] : path[0...-1]
  else
    # Path ends with a bare type: fall back to the first sibling of that type.
    current_type = path.last
    parent_path = path[0...-1]
  end

  parent = navigate_ast(@ast, parent_path)
  return empty_context.call unless parent

  siblings = extract_siblings(parent)
  current_idx ||= siblings.find_index { |s| node_type(s) == current_type }
  return empty_context.call unless current_idx && (0...siblings.size).cover?(current_idx)

  # Exclusive range: for index 0 this is empty (an inclusive `0..-1` would
  # wrongly select every sibling).
  first_before = [0, current_idx - window].max
  before_sibs = siblings[first_before...current_idx]
  after_sibs = siblings[(current_idx + 1)..(current_idx + window)] || []

  Models::Context.new(
    before: before_sibs.map { |s| text_from_node(s) }.join("\n"),
    # The current element is the addressed sibling, not its parent.
    current: text_from_node(siblings[current_idx]),
    after: after_sibs.map { |s| text_from_node(s) }.join("\n"),
    location: location,
    window: window
  )
end
|
|
110
|
+
|
|
111
|
+
# Replace text at a specific location.
|
|
112
|
+
#
|
|
113
|
+
# Navigates the AST to find the text node and replaces it,
|
|
114
|
+
# then regenerates asciidoc.
|
|
115
|
+
#
|
|
116
|
+
# @param location [Location] The location to replace
|
|
117
|
+
# @param new_text [String] The new text
|
|
118
|
+
# @return [AsciidocDocument] New document with replacement
|
|
119
|
+
def replace_node(location, new_text)
  # #parse loads Asciidoctor (or raises if it is unavailable), so no separate
  # require is needed before touching the AST.
  parse unless @parsed

  # Swap the text of the addressed node, then serialise the tree again.
  modified_ast = replace_in_ast(@ast, location.node_path, new_text)
  new_content = convert_ast_to_asciidoc(modified_ast)

  # The constructor takes format and language_code as keyword arguments;
  # passing them positionally raises ArgumentError.
  AsciidocDocument.new(new_content, format: @format, language_code: @language_code)
end
|
|
135
|
+
|
|
136
|
+
# Apply corrections and return new document.
|
|
137
|
+
#
|
|
138
|
+
# @param corrections [Array<Models::SemanticError>] Errors to fix
|
|
139
|
+
# @return [AsciidocDocument] New document with corrections
|
|
140
|
+
def apply(corrections)
  return self if corrections.empty?

  # Thread the document through each correction; every step yields a new document.
  corrections.reduce(self) do |doc, error|
    suggestion = error.recommended_suggestion
    # An error without a recommended suggestion cannot be auto-corrected;
    # leave the text untouched instead of failing on nil.
    next doc unless suggestion

    doc.replace_node(error.location, suggestion.word)
  end
end
|
|
152
|
+
|
|
153
|
+
# Document name for display.
|
|
154
|
+
#
|
|
155
|
+
# @return [String] Document name
|
|
156
|
+
def name
  # Fixed display label for this document type.
  label = "asciidoc"
  label
end
|
|
159
|
+
|
|
160
|
+
private
|
|
161
|
+
|
|
162
|
+
# Extract text nodes from AST.
|
|
163
|
+
#
|
|
164
|
+
# @return [Array<TextNode>] Text nodes
|
|
165
|
+
def extract_text_nodes
  # Build the AST on first use, then walk it from the top-level blocks.
  @parsed || parse
  extract_from_ast(@ast)
end
|
|
169
|
+
|
|
170
|
+
# Extract text nodes recursively from AST.
|
|
171
|
+
#
|
|
172
|
+
# @param blocks [Array<Asciidoctor::AbstractBlock>] Blocks or nodes
|
|
173
|
+
# @param path [Array] Current path
|
|
174
|
+
# @return [Array<TextNode>] Text nodes
|
|
175
|
+
def extract_from_ast(blocks, path: [])
  return [] unless blocks.is_a?(Array)

  blocks.each_with_index.flat_map do |block, idx|
    # Paths interleave a type tag with the index of the node among its siblings.
    current_path = path + [node_type_sym(block), idx]

    case block
    when Asciidoctor::Block
      # Code listings and literal blocks are not prose; never spell-check them.
      next [] if %i[listing literal].include?(block.context)

      children = block.blocks || []
      # Emit exactly one text node per block:
      # - paragraphs use their raw source;
      # - other leaf blocks use their content;
      # - container blocks contribute nothing themselves, because their text
      #   is reached by recursing into the children (calling #content on a
      #   container would duplicate the children's text).
      text =
        if block.context == :paragraph
          block.source&.strip
        elsif children.empty?
          extract_inline_content(block.content)
        end

      found = []
      if text && !text.empty?
        found << TextNode.new(
          text,
          location: Location.for_text_node(current_path, start_offset: 0, length: text.length),
          node_path: current_path
        )
      end

      found.concat(extract_from_ast(children, path: current_path)) if children.any?
      found
    when Asciidoctor::Section
      found = []
      # The section title is addressed through a trailing :title accessor.
      if block.title
        title_path = current_path + [:title]
        found << TextNode.new(
          block.title,
          location: Location.for_text_node(title_path, start_offset: 0, length: block.title.length),
          node_path: title_path
        )
      end

      found.concat(extract_from_ast(block.blocks, path: current_path))
    else
      # Node kinds not handled here contribute no text nodes.
      []
    end
  end
end
|
|
232
|
+
|
|
233
|
+
# Extract inline content from a block.
|
|
234
|
+
#
|
|
235
|
+
# @param content [String] Block content
|
|
236
|
+
# @return [String] Extracted text
|
|
237
|
+
def extract_inline_content(content)
  # Inline markup (bold, italic, links, ...) is not interpreted yet: the
  # content is stringified and trimmed, and a missing value becomes "".
  content ? content.to_s.strip : ""
end
|
|
244
|
+
|
|
245
|
+
# Navigate AST to find node at path.
|
|
246
|
+
#
|
|
247
|
+
# @param ast [Array] The AST
|
|
248
|
+
# @param path [Array] Node path
|
|
249
|
+
# @return [Object, nil] The node or nil
|
|
250
|
+
def navigate_ast(ast, path)
  # Only array paths can be walked; an empty path resolves to the AST itself.
  return nil unless path.is_a?(Array)

  current = ast
  expected_type = nil

  path.each_with_index do |element, position|
    case element
    when Integer
      # After landing on a node, an index refers to that node's child blocks.
      current = current.blocks if !current.is_a?(Array) && current.respond_to?(:blocks)
      return nil unless current.is_a?(Array)
      return nil if element >= current.size

      current = current[element]
      # A preceding type tag must agree with the node actually found there.
      return nil if expected_type && node_type_sym(current) != expected_type

      expected_type = nil
    when Symbol, String
      if element == :title && current.respond_to?(:title)
        # Property access on the current node.
        current = current.title
      elsif path[position + 1].is_a?(Integer)
        # [type, index] pair: the symbol only tags the type of the node at
        # the following index, so remember it and stay where we are.
        expected_type = element.to_sym
      elsif current.is_a?(Array)
        # Bare type: select the first node of that type.
        current = current.find { |node| node_type_sym(node) == element.to_sym }
      end
    else
      return nil
    end
  end

  current
end
|
|
276
|
+
|
|
277
|
+
# Extract sibling nodes from a parent node.
|
|
278
|
+
#
|
|
279
|
+
# @param parent [Object] Parent node
|
|
280
|
+
# @return [Array] Sibling nodes
|
|
281
|
+
def extract_siblings(parent)
  # A section's children are its blocks; an array is already a sibling list.
  return parent.blocks || [] if parent.is_a?(Asciidoctor::Section)
  return parent if parent.is_a?(Array)

  # Anything else has no children to offer.
  []
end
|
|
291
|
+
|
|
292
|
+
# Extract text content from a node.
|
|
293
|
+
#
|
|
294
|
+
# @param node [Object] AST node
|
|
295
|
+
# @return [String] Text content
|
|
296
|
+
def text_from_node(node)
  # Blocks yield their raw source, sections their title, strings themselves;
  # every other kind of node (including nil) yields an empty string.
  if node.is_a?(Asciidoctor::Block)
    node.source || ""
  elsif node.is_a?(Asciidoctor::Section)
    node.title || ""
  elsif node.is_a?(String)
    node
  else
    ""
  end
end
|
|
308
|
+
|
|
309
|
+
# Get the node type symbol.
|
|
310
|
+
#
|
|
311
|
+
# @param node [Object] AST node
|
|
312
|
+
# @return [Symbol] Node type
|
|
313
|
+
def node_type_sym(node)
  # Sections are recognised by class; blocks are refined by their context.
  return :section if node.is_a?(Asciidoctor::Section)

  if node.is_a?(Asciidoctor::Block)
    kind = node.context
    return :paragraph if kind == :paragraph
    return :listing if kind == :listing
  end

  # Generic tag for everything else.
  :block
end
|
|
319
|
+
|
|
320
|
+
# Get the node type.
|
|
321
|
+
#
|
|
322
|
+
# @param node [Object] AST node
|
|
323
|
+
# @return [Symbol] Node type
|
|
324
|
+
def node_type(node)
  # Thin wrapper kept for readability at call sites; same result as node_type_sym.
  type = node_type_sym(node)
  type
end
|
|
327
|
+
|
|
328
|
+
# Replace text in AST at a specific path.
|
|
329
|
+
#
|
|
330
|
+
# @param ast [Array] The AST
|
|
331
|
+
# @param path [Array] Node path to the text node
|
|
332
|
+
# @param new_text [String] The replacement text
|
|
333
|
+
# @return [Array] Modified AST
|
|
334
|
+
def replace_in_ast(ast, path, new_text)
  return ast if path.nil? || path.empty?
  return ast unless ast.is_a?(Array)

  # Shallow copy of this level so the caller's array is not modified.
  modified_ast = ast.dup

  # Paths may be plain indices ([0, 1]) or type-tagged pairs
  # ([:section, 0, :paragraph, 1]); a leading type tag carries no position.
  steps = path.first.is_a?(Integer) ? path : path.drop(1)
  idx, *rest = steps
  return modified_ast unless idx.is_a?(Integer)

  node = modified_ast[idx]
  return modified_ast if node.nil?

  if rest.empty?
    # Target reached: swap the block for a new one carrying the new source.
    if node.is_a?(Asciidoctor::Block)
      modified_ast[idx] = Asciidoctor::Block.new(
        node.parent,
        node.context,
        source: new_text,
        attributes: node.attributes
      )
    end
  elsif rest == [:title]
    # Title replacement is applied to a copy so the original node keeps its title.
    if node.respond_to?(:title=)
      retitled = node.dup
      retitled.title = new_text
      modified_ast[idx] = retitled
    end
  elsif node.is_a?(Asciidoctor::Section) || node.is_a?(Asciidoctor::Block)
    # Descend on a copy of the container; setting @blocks on the original
    # node would silently change the source document's AST as well.
    container = node.dup
    container.instance_variable_set(:@blocks, replace_in_ast(node.blocks, rest, new_text))
    modified_ast[idx] = container
  end

  modified_ast
end
|
|
377
|
+
|
|
378
|
+
# Convert AST back to AsciiDoc format.
|
|
379
|
+
#
|
|
380
|
+
# @param ast [Array] The AST
|
|
381
|
+
# @return [String] AsciiDoc source
|
|
382
|
+
def convert_ast_to_asciidoc(ast)
  # Each node becomes a short run of output lines; runs are joined with "\n".
  rendered = ast.flat_map do |node|
    if node.is_a?(Asciidoctor::Section)
      # Heading marker is one "=" per level plus one, followed by a blank line,
      # then the section's own blocks rendered recursively.
      marker = "=" * (node.level + 1)
      ["#{marker} #{node.title}", "", convert_ast_to_asciidoc(node.blocks)]
    elsif node.is_a?(Asciidoctor::Block)
      kind = node.context
      if kind == :paragraph
        [node.source, ""]
      elsif kind == :listing
        # Listings are re-wrapped in their delimiter lines.
        ["----", node.source, "----", ""]
      else
        [node.source.to_s, ""]
      end
    else
      # Other node kinds produce no output.
      []
    end
  end

  rendered.join("\n")
end
|
|
415
|
+
|
|
416
|
+
# Get plain text context for line/column locations.
|
|
417
|
+
#
|
|
418
|
+
# Fallback for line/column locations in structured documents.
|
|
419
|
+
#
|
|
420
|
+
# @param location [Location] The line/column location
|
|
421
|
+
# @param window [Integer] Number of lines before/after
|
|
422
|
+
# @return [Models::Context] Context object
|
|
423
|
+
def plain_text_context(location, window: 5)
  # @lines is not assigned by the constructors visible in this file, so fall
  # back to splitting the document content when it is absent.
  lines = @lines || content.to_s.lines(chomp: true)

  # location.line is treated as 1-based; clamp so a stray 0 cannot turn into
  # a negative (from-the-end) array index.
  current_idx = [location.line - 1, 0].max

  first = [0, current_idx - window].max
  last = [lines.size - 1, current_idx + window].min

  # Slices past the end of the array are nil, hence the `|| []` guards.
  before = (lines[first...current_idx] || []).join("\n")
  current = lines[current_idx].to_s
  # "After" starts on the line immediately following the current one.
  after = (lines[(current_idx + 1)..last] || []).join("\n")

  Models::Context.new(
    before: before,
    current: current,
    after: after,
    location: location,
    window: window
  )
end
|
|
439
|
+
end
|
|
440
|
+
end
|
|
441
|
+
end
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'location'
|
|
4
|
+
require_relative '../models/context'
|
|
5
|
+
|
|
6
|
+
module Kotoshu
|
|
7
|
+
module Documents
|
|
8
|
+
# Text node abstraction for structured documents.
|
|
9
|
+
#
|
|
10
|
+
# Represents a span of text in a document with location information.
|
|
11
|
+
# Used for spell checking individual text elements in structured formats.
|
|
12
|
+
#
|
|
13
|
+
# @example Creating a text node
|
|
14
|
+
# node = TextNode.new("Hello world", location: Location.new(line: 5, column: 0))
|
|
15
|
+
# node.text # => "Hello world"
|
|
16
|
+
class TextNode
  attr_reader :text, :location, :node_path

  # Build an immutable text node.
  #
  # @param text [String] The text content
  # @param location [Location] Where the text sits in the document
  # @param node_path [Array, nil] Path of the owning node in the document AST
  def initialize(text, location:, node_path: nil)
    @text, @location, @node_path = text, location, node_path
    freeze
  end

  # Whitespace-separated words of the text.
  #
  # @return [Array<String>] Words in the text
  def words
    text.split
  end

  # Two nodes are equal when both their text and their location match;
  # the node path does not take part in the comparison.
  #
  # @param other [Object] Another object
  # @return [Boolean] True if text and location match
  def ==(other)
    other.is_a?(TextNode) && text == other.text && location == other.location
  end
  alias eql? ==

  # Hash code consistent with #== so nodes can be used as hash keys.
  #
  # @return [Integer] Hash code
  def hash
    [text, location].hash
  end

  # Human-readable form: prefixed with the location for line/column
  # locations, the bare text otherwise.
  #
  # @return [String] Human-readable representation
  def to_s
    return text unless location.line_column?

    "#{location}: #{text}"
  end
  alias inspect to_s
end
|
|
68
|
+
|
|
69
|
+
# Abstract base class for documents.
|
|
70
|
+
#
|
|
71
|
+
# Provides a unified interface for different document formats:
|
|
72
|
+
# - Plain text
|
|
73
|
+
# - Markdown
|
|
74
|
+
# - AsciiDoc
|
|
75
|
+
# - Code files (with syntax awareness)
|
|
76
|
+
#
|
|
77
|
+
# Subclasses implement format-specific parsing and context retrieval.
|
|
78
|
+
#
|
|
79
|
+
# @example Plain text document
|
|
80
|
+
# doc = PlainTextDocument.new("Hello world\n")
|
|
81
|
+
# doc.text_nodes.each { |node| puts node.text }
|
|
82
|
+
#
|
|
83
|
+
# @example Markdown document
|
|
84
|
+
# doc = MarkdownDocument.new("# Title\nParagraph text")
|
|
85
|
+
# doc.text_nodes.each { |node| puts node.text }
|
|
86
|
+
class Document
|
|
87
|
+
attr_reader :content, :format, :language_code
|
|
88
|
+
|
|
89
|
+
# Supported document formats
|
|
90
|
+
FORMATS = {
|
|
91
|
+
text: 'Plain Text',
|
|
92
|
+
markdown: 'Markdown',
|
|
93
|
+
asciidoc: 'AsciiDoc',
|
|
94
|
+
code: 'Code'
|
|
95
|
+
}.freeze
|
|
96
|
+
|
|
97
|
+
# Create a new document.
|
|
98
|
+
#
|
|
99
|
+
# @param content [String] The document content
|
|
100
|
+
# @param format [Symbol] Document format (:text, :markdown, :asciidoc, :code)
|
|
101
|
+
# @param language_code [String] ISO 639-1 language code (default: 'en')
|
|
102
|
+
def initialize(content, format: :text, language_code: 'en')
  # Only formats listed in FORMATS are accepted.
  unless FORMATS.key?(format)
    raise ArgumentError, "Invalid format: #{format}"
  end

  @content, @format, @language_code = content, format, language_code
end
|
|
109
|
+
|
|
110
|
+
# Get all text nodes for spell checking.
|
|
111
|
+
#
|
|
112
|
+
# Subclasses implement format-specific text extraction.
|
|
113
|
+
#
|
|
114
|
+
# @return [Array<TextNode>] Text nodes in the document
|
|
115
|
+
def text_nodes
  # Abstract hook: concrete document classes supply the extraction logic.
  message = "#{self.class} must implement #text_nodes"
  raise NotImplementedError, message
end
|
|
118
|
+
|
|
119
|
+
# Get node at a specific path (for structured formats).
|
|
120
|
+
#
|
|
121
|
+
# @param path [Array] Node path (e.g., [:paragraph, 3, :text])
|
|
122
|
+
# @return [Object, nil] The node object or nil
|
|
123
|
+
def get_node(path)
  # Abstract hook: only structured formats can resolve node paths.
  message = "#{self.class} must implement #get_node"
  raise NotImplementedError, message
end
|
|
126
|
+
|
|
127
|
+
# Replace text at a specific location.
|
|
128
|
+
#
|
|
129
|
+
# @param location [Location] The location to replace
|
|
130
|
+
# @param new_text [String] The new text
|
|
131
|
+
# @return [Document] New document with replacement applied
|
|
132
|
+
def replace_node(location, new_text)
  # Abstract hook: replacement depends on the concrete document format.
  message = "#{self.class} must implement #replace_node"
  raise NotImplementedError, message
end
|
|
135
|
+
|
|
136
|
+
# Get context around a specific location.
|
|
137
|
+
#
|
|
138
|
+
# @param location [Location] The error location
|
|
139
|
+
# @param window [Integer] Number of lines before/after (default: 5)
|
|
140
|
+
# @return [Models::Context] Context object
|
|
141
|
+
def context_for(location, window: 5)
  # Abstract hook: context retrieval depends on the concrete document format.
  message = "#{self.class} must implement #context_for"
  raise NotImplementedError, message
end
|
|
144
|
+
|
|
145
|
+
# Apply corrections and return new document.
|
|
146
|
+
#
|
|
147
|
+
# @param corrections [Array<Models::SemanticError>] Errors to fix
|
|
148
|
+
# @return [Document] New document with corrections applied
|
|
149
|
+
def apply(corrections)
  # Abstract hook: applying corrections depends on the concrete document format.
  message = "#{self.class} must implement #apply"
  raise NotImplementedError, message
end
|
|
152
|
+
|
|
153
|
+
# Get word count.
|
|
154
|
+
#
|
|
155
|
+
# @return [Integer] Total word count
|
|
156
|
+
def word_count
  # Argument-less split ignores leading/trailing whitespace, whereas
  # split(/\s+/) yields a leading "" (and an off-by-one count) whenever the
  # content starts with whitespace.
  @content.to_s.split.size
end
|
|
159
|
+
|
|
160
|
+
# Get line count.
|
|
161
|
+
#
|
|
162
|
+
# @return [Integer] Total line count
|
|
163
|
+
def line_count
  # Counts newline-terminated lines plus a final unterminated one, if any.
  @content.each_line.count
end
|
|
166
|
+
|
|
167
|
+
# Get document name (for display).
|
|
168
|
+
#
|
|
169
|
+
# @return [String] Document name or identifier
|
|
170
|
+
def name
  # Generic display label; subclasses may return something more specific.
  label = "document"
  label
end
|
|
173
|
+
|
|
174
|
+
# Detect format from content.
|
|
175
|
+
#
|
|
176
|
+
# @param content [String] The document content
|
|
177
|
+
# @return [Symbol] Detected format
|
|
178
|
+
def self.detect_format(content)
  text = content.to_s

  # Markdown: content opens with an ATX heading marker.
  return :markdown if text.start_with?('#')

  # AsciiDoc: content opens with a title line such as "= Title" or "== Section".
  return :asciidoc if text.match?(/\A={1,6}[ \t]+\S/)

  # A trailing "." says nothing about the format (ordinary prose ends with
  # one), so it is no longer treated as a marker for :code.
  :text
end
|
|
183
|
+
|
|
184
|
+
# Create document from file.
|
|
185
|
+
#
|
|
186
|
+
# @param path [String] Path to the file
|
|
187
|
+
# @return [Document] Document instance
|
|
188
|
+
def self.from_file(path)
  content = File.read(path, encoding: 'UTF-8')

  # The file extension is the most reliable format signal; content sniffing
  # is only a fallback for extensions that say nothing about the format.
  format =
    case File.extname(path.to_s).downcase
    when '.md', '.markdown' then :markdown
    when '.adoc', '.asciidoc' then :asciidoc
    else detect_format(content)
    end

  language_code = detect_language_from_path(path)

  case format
  when :markdown
    MarkdownDocument.new(content, language_code: language_code)
  when :asciidoc
    AsciidocDocument.new(content, language_code: language_code)
  else
    PlainTextDocument.new(content, language_code: language_code)
  end
end
|
|
202
|
+
|
|
203
|
+
# Create document from string with format detection.
|
|
204
|
+
#
|
|
205
|
+
# @param content [String] The document content
|
|
206
|
+
# @param language_code [String] Language code (optional)
|
|
207
|
+
# @return [Document] Document instance
|
|
208
|
+
def self.from_string(content, language_code: 'en')
  # Dispatch to the concrete class for the detected format, mirroring
  # from_file. Calling `new` on this abstract class would return an object
  # whose text_nodes/context_for/apply all raise NotImplementedError.
  case detect_format(content)
  when :markdown
    MarkdownDocument.new(content, language_code: language_code)
  when :asciidoc
    AsciidocDocument.new(content, language_code: language_code)
  else
    PlainTextDocument.new(content, language_code: language_code)
  end
end
|
|
212
|
+
|
|
213
|
+
private
|
|
214
|
+
|
|
215
|
+
# Detect language code from file path.
|
|
216
|
+
#
|
|
217
|
+
# @param path [String] File path
|
|
218
|
+
# @return [String] Language code
|
|
219
|
+
def self.detect_language_from_path(path)
  # NOTE(review): the bare `private` above does not apply to `def self.`
  # methods, so this method is publicly callable; visibility is left as is.
  #
  # Only the file name is inspected (e.g. "README.en.md", "document.de.txt");
  # dotted directory names elsewhere in the path must not be read as a
  # language code.
  match = File.basename(path.to_s).match(/\.([a-z]{2})\./i)

  # Normalise to lower case so "README.EN.md" yields the same code as the
  # 'en' default.
  match ? match[1].downcase : 'en'
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|