kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,441 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'document'
4
+ require_relative 'location'
5
+
6
+ module Kotoshu
7
+ module Documents
8
+ # AsciiDoc document implementation.
9
+ #
10
+ # Handles AsciiDoc files with AST parsing for structured navigation.
11
+ #
12
+ # @example Creating an asciidoc document
13
+ # doc = AsciidocDocument.new("= Title\n\nParagraph text")
14
+ # doc.text_nodes.each { |node| puts node.text }
15
+ class AsciidocDocument < Document
16
+ require 'asciidoctor' if ENV['KOTOSHU_REQUIRE_ASCIIDOC']
17
+
18
# Build an AsciiDoc document wrapper around raw source text.
#
# Parsing is lazy: the AST stays nil until #parse is first called.
#
# @param content [String] The document content
# @param format [Symbol] Document format (only :asciidoc is accepted)
# @param language_code [String] Language code
# @raise [ArgumentError] if format is anything other than :asciidoc
def initialize(content, format: :asciidoc, language_code: 'en')
  if format != :asciidoc
    raise ArgumentError, "Format must be :asciidoc"
  end

  super(content, format: format, language_code: language_code)

  @ast = nil
  @parsed = false
end
30
+
31
# Parse the AsciiDoc source into its list of top-level blocks.
#
# The result is memoised in @ast; later calls return the cached list.
# The asciidoctor gem is loaded lazily so it is only required when an
# AsciiDoc document is actually parsed.
#
# @return [Array<Asciidoctor::AbstractBlock>] Top-level blocks of the document
# @raise [RuntimeError] if the asciidoctor gem cannot be loaded
def parse
  return @ast if @parsed

  begin
    require 'asciidoctor'
  rescue LoadError
    raise "Asciidoctor gem not available. Add 'asciidoctor' to Gemfile"
  end

  # Asciidoctor.load parses eagerly by default. Passing `parse: false`
  # defers parsing, which leaves `doc.blocks` empty, so it must not be set.
  doc = Asciidoctor.load(content, header_footer: false)
  @ast = doc.blocks
  @parsed = true

  @ast
end
50
+
51
# Collect the spell-checkable text nodes of this document.
#
# Delegates to the private AST walker (#extract_text_nodes).
#
# @return [Array<TextNode>] Text nodes in the document
def text_nodes
  collected = extract_text_nodes
  collected
end
59
+
60
# Look up the AST node addressed by a path.
#
# Parses the document first if that has not happened yet.
#
# @param path [Array] Node path (e.g., [:section, 0, :paragraph, 2])
# @return [Object, nil] The node or nil
def get_node(path)
  needs_parse = !@parsed
  parse if needs_parse

  navigate_ast(@ast, path)
end
69
+
70
# Get context around a location.
#
# Line/column locations fall back to plain-text context (fixed window of
# 5 lines). Node-based locations are resolved against the AST: the text
# of up to `window` siblings before and after the located node is
# returned, together with the node's own text.
#
# The last path element may be either a node type (the first sibling of
# that type is used) or an Integer index (the `[..., type, index]` form
# produced by the text-node extractor).
#
# @param location [Location] The error location
# @param window [Integer] Number of sibling elements before/after
# @return [Models::Context] Context object (all-empty when the node cannot be found)
def context_for(location, window: 2)
  return plain_text_context(location, window: 5) if location.line_column?

  parse unless @parsed

  blank = lambda do
    Models::Context.new(before: "", current: "", after: "", location: location, window: window)
  end

  node_path = Array(location.node_path)
  target = node_path.last
  # For a typed pair the parent is everything before `type, index`.
  parent_path = target.is_a?(Integer) ? node_path[0...-2] : node_path[0...-1]

  parent = navigate_ast(@ast, parent_path)
  return blank.call unless parent

  siblings = extract_siblings(parent)
  current_idx =
    if target.is_a?(Integer)
      target if target >= 0 && target < siblings.size
    else
      siblings.find_index { |sibling| node_type(sibling) == target }
    end
  return blank.call unless current_idx

  # Exclusive range: with an inclusive `..(current_idx - 1)` the first
  # sibling would yield `0..-1`, i.e. the whole list.
  first_before = [0, current_idx - window].max
  before_sibs = siblings[first_before...current_idx]
  after_sibs = siblings[(current_idx + 1)..(current_idx + window)] || []

  Models::Context.new(
    before: before_sibs.map { |sibling| text_from_node(sibling) }.join("\n"),
    current: text_from_node(siblings[current_idx]),
    after: after_sibs.map { |sibling| text_from_node(sibling) }.join("\n"),
    location: location,
    window: window
  )
end
110
+
111
# Replace text at a specific location.
#
# Rewrites the node addressed by `location.node_path` in a copy of the
# AST, regenerates AsciiDoc source from it, and wraps that source in a
# new document carrying this document's format and language code.
#
# @param location [Location] The location to replace
# @param new_text [String] The new text
# @return [AsciidocDocument] New document with replacement
# @raise [RuntimeError] if the asciidoctor gem cannot be loaded
def replace_node(location, new_text)
  parse unless @parsed

  modified_ast = replace_in_ast(@ast, location.node_path, new_text)

  begin
    require 'asciidoctor'
  rescue LoadError
    raise "Asciidoctor gem not available. Add 'asciidoctor' to Gemfile"
  end

  new_content = convert_ast_to_asciidoc(modified_ast)

  # The constructor takes format/language_code as keywords; passing them
  # positionally raises ArgumentError.
  AsciidocDocument.new(new_content, format: @format, language_code: @language_code)
end
135
+
136
# Apply corrections and return new document.
#
# Corrections are applied in order, each one to the document produced by
# the previous replacement. Errors without a recommended suggestion are
# skipped instead of raising.
#
# @param corrections [Array<Models::SemanticError>] Errors to fix
# @return [AsciidocDocument] New document with corrections (self when nothing to apply)
def apply(corrections)
  return self if corrections.empty?

  corrections.reduce(self) do |doc, error|
    suggestion = error.recommended_suggestion
    # Nothing to substitute: keep the document as it is.
    next doc unless suggestion

    doc.replace_node(error.location, suggestion.word)
  end
end
152
+
153
# Display name used for documents of this format.
#
# @return [String] Always "asciidoc"
def name
  'asciidoc'
end
159
+
160
+ private
161
+
162
# Walk the (lazily parsed) AST and collect its text nodes.
#
# @return [Array<TextNode>] Text nodes
def extract_text_nodes
  @parsed || parse

  extract_from_ast(@ast)
end
169
+
170
# Extract text nodes recursively from AST.
#
# Each node is addressed by a typed path: the parent's path followed by
# `[node_type, index]`. Section titles get an extra `:title` step.
# Listing and literal blocks are skipped entirely (their children too).
#
# A paragraph contributes exactly one node, built from its raw source;
# other blocks contribute their `content` (if any). Emitting both for a
# paragraph would report the same text twice.
#
# @param blocks [Array<Asciidoctor::AbstractBlock>] Blocks or nodes
# @param path [Array] Current path
# @return [Array<TextNode>] Text nodes
def extract_from_ast(blocks, path: [])
  nodes = []
  return nodes unless blocks.is_a?(Array)

  build_node = lambda do |text, node_path|
    TextNode.new(
      text,
      location: Location.for_text_node(node_path, start_offset: 0, length: text.length),
      node_path: node_path
    )
  end

  blocks.each_with_index do |block, idx|
    current_path = path + [node_type_sym(block), idx]

    case block
    when Asciidoctor::Block
      # Skip code blocks and source listings
      next if %i[listing literal].include?(block.context)

      text =
        if block.context == :paragraph
          block.source&.strip
        elsif block.content
          extract_inline_content(block.content)
        end
      nodes << build_node.call(text, current_path) if text && !text.empty?

      # Recurse into nested blocks
      nodes.concat(extract_from_ast(block.blocks, path: current_path)) if block.blocks&.any?
    when Asciidoctor::Section
      title_path = current_path + [:title]
      nodes << build_node.call(block.title, title_path) if block.title

      nodes.concat(extract_from_ast(block.blocks, path: current_path))
    end
  end

  nodes
end
232
+
233
# Turn a block's content into plain text for checking.
#
# Currently this only stringifies and trims the content; inline
# formatting (bold, italic, links, ...) is not parsed.
#
# @param content [String] Block content
# @return [String] Extracted text ("" when content is nil/false)
def extract_inline_content(content)
  content ? content.to_s.strip : ""
end
244
+
245
# Navigate AST to find node at path.
#
# Path steps are interpreted as follows:
# - Integer: index into the current list of nodes.
# - :title: the title of the current node (when it has one).
# - any other Symbol/String: a node type. When the next step is an
#   Integer the type merely labels the node at that index (the
#   `[type, index]` form produced by the text-node extractor); otherwise
#   the first child of that type is selected.
# A type step applied to a single node looks at that node's `blocks`.
#
# @param ast [Array] The AST
# @param path [Array] Node path
# @return [Object, nil] The node or nil (also nil when path is not an Array)
def navigate_ast(ast, path)
  return nil unless path.is_a?(Array)

  current = ast
  path.each_with_index do |step, position|
    case step
    when Integer
      return nil unless current.is_a?(Array)
      return nil if step >= current.size

      current = current[step]
    when Symbol, String
      tag = step.to_sym
      if tag == :title && current.respond_to?(:title)
        current = current.title
        next
      end

      children =
        if current.is_a?(Array)
          current
        elsif current.respond_to?(:blocks)
          current.blocks
        end
      # Nothing to descend into: leave the current node untouched.
      next unless children

      current =
        if path[position + 1].is_a?(Integer)
          children
        else
          children.find { |node| node_type_sym(node) == tag }
        end
    else
      return nil
    end
  end

  current
end
276
+
277
# List the child nodes among which a located node is searched.
#
# @param parent [Object] Parent node (a section or a plain list of nodes)
# @return [Array] A section's blocks, the list itself, or [] for anything else
def extract_siblings(parent)
  return parent.blocks || [] if parent.is_a?(Asciidoctor::Section)

  parent.is_a?(Array) ? parent : []
end
291
+
292
# Produce the text that represents a node in a context excerpt.
#
# @param node [Object] AST node
# @return [String] Block source, section title, the string itself, or ""
def text_from_node(node)
  return node.source || "" if node.is_a?(Asciidoctor::Block)
  return node.title || "" if node.is_a?(Asciidoctor::Section)

  node.is_a?(String) ? node : ""
end
308
+
309
# Classify an AST node for use as a path tag.
#
# @param node [Object] AST node
# @return [Symbol] :section, :paragraph, :listing, or :block for everything else
def node_type_sym(node)
  case node
  when Asciidoctor::Section
    :section
  when Asciidoctor::Block
    kind = node.context
    if kind == :paragraph
      :paragraph
    elsif kind == :listing
      :listing
    else
      :block
    end
  else
    :block
  end
end
319
+
320
# Node type of an AST node; a thin wrapper around #node_type_sym.
#
# @param node [Object] AST node
# @return [Symbol] Node type
def node_type(node)
  type = node_type_sym(node)
  type
end
327
+
328
# Replace text in AST at a specific path.
#
# Returns a modified copy; neither the given list nor the nodes it holds
# are changed. Containers on the way to the target are duplicated before
# their `@blocks` are swapped, so the AST cached by the original
# document keeps its old text.
#
# Type tags (Symbols/Strings) in the path only label the node found at
# the following index, so they are ignored here: `[:section, 0,
# :paragraph, 1]` addresses the same node as `[0, 1]`. Only
# Asciidoctor::Block targets are replaced; any other target (e.g. a
# section title) leaves the copy unchanged.
#
# @param ast [Array] The AST
# @param path [Array] Node path to the text node
# @param new_text [String] The replacement text
# @return [Array] Modified AST
def replace_in_ast(ast, path, new_text)
  return ast if path.empty?
  return ast unless ast.is_a?(Array)

  copy = ast.dup
  indices = path.reject { |step| step.is_a?(Symbol) || step.is_a?(String) }
  idx, *rest = indices
  return copy unless idx.is_a?(Integer)

  node = copy[idx]

  if rest.empty?
    if node.is_a?(Asciidoctor::Block)
      # Build a fresh block carrying the new source text.
      copy[idx] = Asciidoctor::Block.new(
        node.parent,
        node.context,
        source: new_text,
        attributes: node.attributes
      )
    end
  elsif node.is_a?(Asciidoctor::Section) || node.is_a?(Asciidoctor::Block)
    container = node.dup
    container.instance_variable_set(:@blocks, replace_in_ast(node.blocks, rest, new_text))
    copy[idx] = container
  end

  copy
end
377
+
378
# Serialise a list of AST nodes back into AsciiDoc source text.
#
# Sections become a `=`-prefixed heading (level + 1 equals signs)
# followed by their rendered children; paragraphs emit their source;
# listings are wrapped in `----` fences; other blocks emit their source
# as-is. Every element is followed by a blank line. Nodes that are
# neither sections nor blocks are dropped.
#
# @param ast [Array] The AST
# @return [String] AsciiDoc source
def convert_ast_to_asciidoc(ast)
  pieces = ast.flat_map do |element|
    if element.is_a?(Asciidoctor::Section)
      marker = "=" * (element.level + 1)
      ["#{marker} #{element.title}", "", convert_ast_to_asciidoc(element.blocks)]
    elsif element.is_a?(Asciidoctor::Block)
      kind = element.context
      if kind == :paragraph
        [element.source, ""]
      elsif kind == :listing
        ["----", element.source, "----", ""]
      else
        [element.source.to_s, ""]
      end
    else
      []
    end
  end

  pieces.join("\n")
end
415
+
416
# Get plain text context for line/column locations.
#
# Fallback for line/column locations in structured documents. The
# location's line number is treated as 1-based (it is decremented to
# index the line array).
#
# Lines come from `@lines` when that is set; otherwise they are derived
# from the document content, since neither initializer visible in this
# file assigns `@lines`.
#
# @param location [Location] The line/column location
# @param window [Integer] Number of lines before/after
# @return [Models::Context] Context object
def plain_text_context(location, window: 5)
  lines = @lines || content.to_s.lines(chomp: true)
  current_index = location.line - 1

  start_line = [0, current_index - window].max
  end_line = [lines.size - 1, current_index + window].min

  before = (lines[start_line...current_index] || []).join("\n")
  current = lines[current_index]
  # Start right after the current line; `location.line + 1` as an index
  # would skip one line. A slice past the end is nil, hence the `|| []`.
  after = (lines[(current_index + 1)..end_line] || []).join("\n")

  Models::Context.new(
    before: before,
    current: current,
    after: after,
    location: location,
    window: window
  )
end
439
+ end
440
+ end
441
+ end
@@ -0,0 +1,229 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'location'
4
+ require_relative '../models/context'
5
+
6
+ module Kotoshu
7
+ module Documents
8
# A span of text taken from a document, paired with where it came from.
#
# Instances are frozen on construction. Two nodes are equal when both
# their text and their location are equal; the AST path is not compared.
#
# @example Creating a text node
#   node = TextNode.new("Hello world", location: Location.new(line: 5, column: 0))
#   node.text # => "Hello world"
class TextNode
  attr_reader :text, :location, :node_path

  # @param text [String] The text content
  # @param location [Location] Location of the text
  # @param node_path [Array, nil] Path in document AST
  def initialize(text, location:, node_path: nil)
    @text, @location, @node_path = text, location, node_path
    freeze
  end

  # Whitespace-separated words of the text.
  #
  # @return [Array<String>] Words in the text
  def words
    text.split
  end

  # Equality on text and location.
  #
  # @param other [Object] Another object
  # @return [Boolean] True if text and location match
  def ==(other)
    other.is_a?(TextNode) && text == other.text && location == other.location
  end
  alias eql? ==

  # Hash code consistent with #==, for use as a hash key.
  #
  # @return [Integer] Hash code
  def hash
    [text, location].hash
  end

  # Text prefixed with its location for line/column locations, otherwise
  # the bare text.
  #
  # @return [String] Human-readable representation
  def to_s
    location.line_column? ? "#{location}: #{text}" : text
  end
  alias inspect to_s
end
68
+
69
+ # Abstract base class for documents.
70
+ #
71
+ # Provides a unified interface for different document formats:
72
+ # - Plain text
73
+ # - Markdown
74
+ # - AsciiDoc
75
+ # - Code files (with syntax awareness)
76
+ #
77
+ # Subclasses implement format-specific parsing and context retrieval.
78
+ #
79
+ # @example Plain text document
80
+ # doc = PlainTextDocument.new("Hello world\n")
81
+ # doc.text_nodes.each { |node| puts node.text }
82
+ #
83
+ # @example Markdown document
84
+ # doc = MarkdownDocument.new("# Title\nParagraph text")
85
+ # doc.text_nodes.each { |node| puts node.text }
86
+ class Document
87
+ attr_reader :content, :format, :language_code
88
+
89
+ # Supported document formats
90
+ FORMATS = {
91
+ text: 'Plain Text',
92
+ markdown: 'Markdown',
93
+ asciidoc: 'AsciiDoc',
94
+ code: 'Code'
95
+ }.freeze
96
+
97
# Store the content and metadata shared by every document type.
#
# @param content [String] The document content
# @param format [Symbol] Document format (:text, :markdown, :asciidoc, :code)
# @param language_code [String] ISO 639-1 language code (default: 'en')
# @raise [ArgumentError] if format is not a key of FORMATS
def initialize(content, format: :text, language_code: 'en')
  unless FORMATS.key?(format)
    raise ArgumentError, "Invalid format: #{format}"
  end

  @content, @format, @language_code = content, format, language_code
end
109
+
110
# Abstract: text nodes to spell check.
#
# Format-specific subclasses provide the extraction.
#
# @return [Array<TextNode>] Text nodes in the document
# @raise [NotImplementedError] always, in this base class
def text_nodes
  message = "#{self.class} must implement #text_nodes"
  raise NotImplementedError, message
end
118
+
119
# Abstract: node lookup by path (for structured formats).
#
# @param path [Array] Node path (e.g., [:paragraph, 3, :text])
# @return [Object, nil] The node object or nil
# @raise [NotImplementedError] always, in this base class
def get_node(path)
  message = "#{self.class} must implement #get_node"
  raise NotImplementedError, message
end
126
+
127
# Abstract: replace the text at a location.
#
# @param location [Location] The location to replace
# @param new_text [String] The new text
# @return [Document] New document with replacement applied
# @raise [NotImplementedError] always, in this base class
def replace_node(location, new_text)
  message = "#{self.class} must implement #replace_node"
  raise NotImplementedError, message
end
135
+
136
# Abstract: context surrounding a location.
#
# @param location [Location] The error location
# @param window [Integer] Number of lines before/after (default: 5)
# @return [Models::Context] Context object
# @raise [NotImplementedError] always, in this base class
def context_for(location, window: 5)
  message = "#{self.class} must implement #context_for"
  raise NotImplementedError, message
end
144
+
145
# Abstract: apply a list of corrections.
#
# @param corrections [Array<Models::SemanticError>] Errors to fix
# @return [Document] New document with corrections applied
# @raise [NotImplementedError] always, in this base class
def apply(corrections)
  message = "#{self.class} must implement #apply"
  raise NotImplementedError, message
end
152
+
153
# Get word count.
#
# Words are runs of non-whitespace characters. Argument-less `split`
# ignores leading whitespace, whereas `split(/\s+/)` yields an empty
# first element for content that starts with whitespace and so counts
# one word too many.
#
# @return [Integer] Total word count (0 for nil or blank content)
def word_count
  @content.to_s.split.size
end
159
+
160
# Number of lines in the document content.
#
# @return [Integer] Total line count
def line_count
  @content.each_line.count
end
166
+
167
# Generic display name; subclasses may return something more specific.
#
# @return [String] Document name or identifier
def name
  'document'
end
173
+
174
# Detect format from content.
#
# Heuristics, checked in order:
# - a leading shebang (`#!`) marks a script            -> :code
# - a leading `#` is taken as a Markdown heading        -> :markdown
# - a leading `= Title` / `== Section` line is AsciiDoc -> :asciidoc
# - anything else                                       -> :text
#
# Text that merely ends with a period is ordinary prose and is reported
# as :text, not :code.
#
# @param content [String] The document content
# @return [Symbol] Detected format
def self.detect_format(content)
  text = content.to_s

  return :code if text.start_with?('#!')
  return :markdown if text.start_with?('#')
  return :asciidoc if text.match?(/\A={1,6} \S/)

  :text
end
183
+
184
# Create document from file.
#
# The format is taken from the file extension when it is a known
# Markdown or AsciiDoc extension, and detected from the content
# otherwise. The language code is derived from the path.
#
# @param path [String] Path to the file
# @return [Document] Document instance (Markdown, AsciiDoc or plain text)
def self.from_file(path)
  content = File.read(path, encoding: 'UTF-8')

  format =
    case File.extname(path.to_s).downcase
    when '.md', '.markdown' then :markdown
    when '.adoc', '.asciidoc' then :asciidoc
    else detect_format(content)
    end
  language_code = detect_language_from_path(path)

  case format
  when :markdown
    MarkdownDocument.new(content, language_code: language_code)
  when :asciidoc
    AsciidocDocument.new(content, language_code: language_code)
  else
    PlainTextDocument.new(content, language_code: language_code)
  end
end
202
+
203
# Build a document from a string, detecting its format from the content.
#
# Instantiates the receiver class itself with the detected format.
#
# @param content [String] The document content
# @param language_code [String] Language code (optional)
# @return [Document] Document instance
def self.from_string(content, language_code: 'en')
  new(content, format: detect_format(content), language_code: language_code)
end
212
+
213
+ private
214
+
215
+ # Detect language code from file path.
216
+ #
217
+ # @param path [String] File path
218
+ # @return [String] Language code
219
+ def self.detect_language_from_path(path)
220
+ # Extract from path like "README.en.md" or "document.de.txt"
221
+ if path =~ /\.([a-z]{2})\./i
222
+ Regexp.last_match(1)
223
+ else
224
+ 'en'
225
+ end
226
+ end
227
+ end
228
+ end
229
+ end