parsanol 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of parsanol might be problematic. Click here for more details.

Files changed (336) hide show
  1. checksums.yaml +7 -0
  2. data/HISTORY.txt +25 -0
  3. data/LICENSE +23 -0
  4. data/README.adoc +643 -0
  5. data/Rakefile +189 -0
  6. data/example/balanced-parens/basic.rb +42 -0
  7. data/example/balanced-parens/basic.rb.md +86 -0
  8. data/example/balanced-parens/parens.rb +42 -0
  9. data/example/balanced-parens/ruby_transform.rb +162 -0
  10. data/example/big.erb +73 -0
  11. data/example/boolean-algebra/basic.rb +70 -0
  12. data/example/boolean-algebra/basic.rb.md +108 -0
  13. data/example/boolean-algebra/ruby_transform.rb +263 -0
  14. data/example/calculator/basic.rb +153 -0
  15. data/example/calculator/basic.rb.md +120 -0
  16. data/example/calculator/pattern.rb +153 -0
  17. data/example/calculator/ruby_transform.rb +156 -0
  18. data/example/calculator/ruby_transform.rb.md +32 -0
  19. data/example/calculator/serialized.rb +257 -0
  20. data/example/calculator/serialized.rb.md +32 -0
  21. data/example/calculator/transform.rb +153 -0
  22. data/example/calculator/zero_copy.rb +269 -0
  23. data/example/calculator/zero_copy.rb.md +36 -0
  24. data/example/capture/basic.rb +49 -0
  25. data/example/capture/basic.rb.md +106 -0
  26. data/example/capture/example.json +39 -0
  27. data/example/comments/basic.rb +35 -0
  28. data/example/comments/basic.rb.md +110 -0
  29. data/example/csv/ruby_transform.rb +148 -0
  30. data/example/csv/ruby_transform.rb.md +131 -0
  31. data/example/csv/serialized.rb +201 -0
  32. data/example/csv/serialized.rb.md +31 -0
  33. data/example/csv/zero_copy.rb +276 -0
  34. data/example/csv/zero_copy.rb.md +36 -0
  35. data/example/custom_atoms/indent_atom.rb +79 -0
  36. data/example/deepest-errors/basic.rb +131 -0
  37. data/example/deepest-errors/basic.rb.md +152 -0
  38. data/example/documentation/basic.rb +18 -0
  39. data/example/documentation/basic.rb.md +97 -0
  40. data/example/email/basic.rb +55 -0
  41. data/example/email/basic.rb.md +102 -0
  42. data/example/email/ruby_transform.rb +106 -0
  43. data/example/empty/basic.rb +13 -0
  44. data/example/empty/basic.rb.md +73 -0
  45. data/example/empty/example.json +38 -0
  46. data/example/erb/basic.rb +47 -0
  47. data/example/erb/basic.rb.md +103 -0
  48. data/example/erb/optimized.rb +42 -0
  49. data/example/error-reporting/basic.rb +132 -0
  50. data/example/error-reporting/basic.rb.md +122 -0
  51. data/example/expression-evaluator/basic.rb +284 -0
  52. data/example/expression-evaluator/basic.rb.md +138 -0
  53. data/example/ini/basic.rb +154 -0
  54. data/example/ini/basic.rb.md +129 -0
  55. data/example/ini/ruby_transform.rb +154 -0
  56. data/example/ip-address/basic.rb +125 -0
  57. data/example/ip-address/basic.rb.md +139 -0
  58. data/example/iso-6709/basic.rb +231 -0
  59. data/example/iso-6709/basic.rb.md +143 -0
  60. data/example/iso-8601/basic.rb +275 -0
  61. data/example/iso-8601/basic.rb.md +149 -0
  62. data/example/json/basic.rb +128 -0
  63. data/example/json/basic.rb.md +121 -0
  64. data/example/json/pattern.rb +128 -0
  65. data/example/json/ruby_transform.rb +200 -0
  66. data/example/json/ruby_transform.rb.md +32 -0
  67. data/example/json/serialized.rb +233 -0
  68. data/example/json/serialized.rb.md +31 -0
  69. data/example/json/transform.rb +128 -0
  70. data/example/json/zero_copy.rb +316 -0
  71. data/example/json/zero_copy.rb.md +36 -0
  72. data/example/local/basic.rb +34 -0
  73. data/example/local/basic.rb.md +91 -0
  74. data/example/local/example.json +38 -0
  75. data/example/markdown/basic.rb +287 -0
  76. data/example/markdown/basic.rb.md +160 -0
  77. data/example/markup/basic.rb +173 -0
  78. data/example/markup/basic.rb.md +118 -0
  79. data/example/mathn/basic.rb +47 -0
  80. data/example/mathn/basic.rb.md +96 -0
  81. data/example/mathn/example.json +39 -0
  82. data/example/minilisp/basic.rb +94 -0
  83. data/example/minilisp/basic.rb.md +133 -0
  84. data/example/modularity/basic.rb +47 -0
  85. data/example/modularity/basic.rb.md +152 -0
  86. data/example/nested-errors/basic.rb +132 -0
  87. data/example/nested-errors/basic.rb.md +157 -0
  88. data/example/output/boolean_algebra.out +4 -0
  89. data/example/output/calc.out +1 -0
  90. data/example/output/capture.out +3 -0
  91. data/example/output/comments.out +8 -0
  92. data/example/output/deepest_errors.out +54 -0
  93. data/example/output/documentation.err +4 -0
  94. data/example/output/documentation.out +1 -0
  95. data/example/output/email_parser.out +2 -0
  96. data/example/output/empty.err +1 -0
  97. data/example/output/erb.out +7 -0
  98. data/example/output/ignore.out +1 -0
  99. data/example/output/ignore_whitespace.out +1 -0
  100. data/example/output/ip_address.out +9 -0
  101. data/example/output/json.out +5 -0
  102. data/example/output/local.out +3 -0
  103. data/example/output/mathn.out +4 -0
  104. data/example/output/minilisp.out +5 -0
  105. data/example/output/modularity.out +0 -0
  106. data/example/output/nested_errors.out +54 -0
  107. data/example/output/optimized_erb.out +1 -0
  108. data/example/output/parens.out +8 -0
  109. data/example/output/prec_calc.out +5 -0
  110. data/example/output/readme.out +1 -0
  111. data/example/output/scopes.out +1 -0
  112. data/example/output/seasons.out +28 -0
  113. data/example/output/sentence.out +1 -0
  114. data/example/output/simple_xml.out +2 -0
  115. data/example/output/string_parser.out +3 -0
  116. data/example/prec-calc/basic.rb +71 -0
  117. data/example/prec-calc/basic.rb.md +114 -0
  118. data/example/readme/basic.rb +30 -0
  119. data/example/readme/basic.rb.md +80 -0
  120. data/example/scopes/basic.rb +15 -0
  121. data/example/scopes/basic.rb.md +73 -0
  122. data/example/scopes/example.json +38 -0
  123. data/example/seasons/basic.rb +46 -0
  124. data/example/seasons/basic.rb.md +117 -0
  125. data/example/seasons/example.json +40 -0
  126. data/example/sentence/basic.rb +36 -0
  127. data/example/sentence/basic.rb.md +81 -0
  128. data/example/sexp/ruby_transform.rb +180 -0
  129. data/example/sexp/ruby_transform.rb.md +143 -0
  130. data/example/simple-xml/basic.rb +54 -0
  131. data/example/simple-xml/basic.rb.md +125 -0
  132. data/example/simple.lit +3 -0
  133. data/example/string-literal/basic.rb +77 -0
  134. data/example/string-literal/basic.rb.md +128 -0
  135. data/example/test.lit +4 -0
  136. data/example/toml/basic.rb +226 -0
  137. data/example/toml/basic.rb.md +173 -0
  138. data/example/url/basic.rb +219 -0
  139. data/example/url/basic.rb.md +142 -0
  140. data/example/url/ruby_transform.rb +219 -0
  141. data/example/yaml/basic.rb +216 -0
  142. data/example/yaml/basic.rb.md +148 -0
  143. data/ext/parsanol_native/extconf.rb +4 -0
  144. data/lib/parsanol/accelerator/application.rb +62 -0
  145. data/lib/parsanol/accelerator/engine.rb +112 -0
  146. data/lib/parsanol/accelerator.rb +162 -0
  147. data/lib/parsanol/ast_visitor.rb +122 -0
  148. data/lib/parsanol/atoms/alternative.rb +97 -0
  149. data/lib/parsanol/atoms/base.rb +214 -0
  150. data/lib/parsanol/atoms/can_flatten.rb +192 -0
  151. data/lib/parsanol/atoms/capture.rb +41 -0
  152. data/lib/parsanol/atoms/context.rb +351 -0
  153. data/lib/parsanol/atoms/context_optimized.rb +42 -0
  154. data/lib/parsanol/atoms/custom.rb +110 -0
  155. data/lib/parsanol/atoms/cut.rb +62 -0
  156. data/lib/parsanol/atoms/dsl.rb +130 -0
  157. data/lib/parsanol/atoms/dynamic.rb +33 -0
  158. data/lib/parsanol/atoms/entity.rb +55 -0
  159. data/lib/parsanol/atoms/ignored.rb +28 -0
  160. data/lib/parsanol/atoms/infix.rb +121 -0
  161. data/lib/parsanol/atoms/lookahead.rb +64 -0
  162. data/lib/parsanol/atoms/named.rb +50 -0
  163. data/lib/parsanol/atoms/re.rb +61 -0
  164. data/lib/parsanol/atoms/repetition.rb +241 -0
  165. data/lib/parsanol/atoms/scope.rb +28 -0
  166. data/lib/parsanol/atoms/sequence.rb +157 -0
  167. data/lib/parsanol/atoms/str.rb +90 -0
  168. data/lib/parsanol/atoms/visitor.rb +91 -0
  169. data/lib/parsanol/atoms.rb +36 -0
  170. data/lib/parsanol/buffer.rb +130 -0
  171. data/lib/parsanol/builder_callbacks.rb +353 -0
  172. data/lib/parsanol/cause.rb +101 -0
  173. data/lib/parsanol/context.rb +23 -0
  174. data/lib/parsanol/convenience.rb +35 -0
  175. data/lib/parsanol/edit_tracker.rb +107 -0
  176. data/lib/parsanol/error_reporter/contextual.rb +122 -0
  177. data/lib/parsanol/error_reporter/deepest.rb +106 -0
  178. data/lib/parsanol/error_reporter/tree.rb +68 -0
  179. data/lib/parsanol/error_reporter.rb +98 -0
  180. data/lib/parsanol/export.rb +163 -0
  181. data/lib/parsanol/expression/treetop.rb +94 -0
  182. data/lib/parsanol/expression.rb +51 -0
  183. data/lib/parsanol/fast_mode.rb +145 -0
  184. data/lib/parsanol/first_set.rb +75 -0
  185. data/lib/parsanol/grammar_builder.rb +177 -0
  186. data/lib/parsanol/graphviz.rb +97 -0
  187. data/lib/parsanol/incremental_parser.rb +179 -0
  188. data/lib/parsanol/interval_tree.rb +215 -0
  189. data/lib/parsanol/lazy_result.rb +178 -0
  190. data/lib/parsanol/lexer.rb +146 -0
  191. data/lib/parsanol/native/parser.rb +630 -0
  192. data/lib/parsanol/native/serializer.rb +245 -0
  193. data/lib/parsanol/native/transformer.rb +438 -0
  194. data/lib/parsanol/native/types.rb +41 -0
  195. data/lib/parsanol/native.rb +217 -0
  196. data/lib/parsanol/optimizer.rb +86 -0
  197. data/lib/parsanol/optimizers/choice_optimizer.rb +78 -0
  198. data/lib/parsanol/optimizers/cut_inserter.rb +175 -0
  199. data/lib/parsanol/optimizers/lookahead_optimizer.rb +58 -0
  200. data/lib/parsanol/optimizers/quantifier_optimizer.rb +62 -0
  201. data/lib/parsanol/optimizers/sequence_optimizer.rb +97 -0
  202. data/lib/parsanol/options/ruby_transform.rb +109 -0
  203. data/lib/parsanol/options/serialized.rb +94 -0
  204. data/lib/parsanol/options/zero_copy.rb +130 -0
  205. data/lib/parsanol/options.rb +20 -0
  206. data/lib/parsanol/parallel.rb +133 -0
  207. data/lib/parsanol/parsanol_native.bundle +0 -0
  208. data/lib/parsanol/parser.rb +151 -0
  209. data/lib/parsanol/parslet.rb +148 -0
  210. data/lib/parsanol/parslet_native.bundle +0 -0
  211. data/lib/parsanol/pattern/binding.rb +49 -0
  212. data/lib/parsanol/pattern.rb +115 -0
  213. data/lib/parsanol/pool.rb +220 -0
  214. data/lib/parsanol/pools/array_pool.rb +75 -0
  215. data/lib/parsanol/pools/buffer_pool.rb +173 -0
  216. data/lib/parsanol/pools/position_pool.rb +92 -0
  217. data/lib/parsanol/pools/slice_pool.rb +64 -0
  218. data/lib/parsanol/position.rb +89 -0
  219. data/lib/parsanol/result.rb +44 -0
  220. data/lib/parsanol/result_builder.rb +208 -0
  221. data/lib/parsanol/result_stream.rb +262 -0
  222. data/lib/parsanol/rig/rspec.rb +52 -0
  223. data/lib/parsanol/rope.rb +78 -0
  224. data/lib/parsanol/scope.rb +42 -0
  225. data/lib/parsanol/slice.rb +172 -0
  226. data/lib/parsanol/source/line_cache.rb +99 -0
  227. data/lib/parsanol/source.rb +171 -0
  228. data/lib/parsanol/source_location.rb +164 -0
  229. data/lib/parsanol/streaming_parser.rb +124 -0
  230. data/lib/parsanol/string_view.rb +192 -0
  231. data/lib/parsanol/transform.rb +267 -0
  232. data/lib/parsanol/version.rb +5 -0
  233. data/lib/parsanol/wasm/README.md +80 -0
  234. data/lib/parsanol/wasm/package.json +51 -0
  235. data/lib/parsanol/wasm/parsanol.js +252 -0
  236. data/lib/parsanol/wasm/parslet.d.ts +129 -0
  237. data/lib/parsanol/wasm_parser.rb +239 -0
  238. data/lib/parsanol.rb +408 -0
  239. data/parsanol-ruby.gemspec +56 -0
  240. data/spec/acceptance/examples_spec.rb +96 -0
  241. data/spec/acceptance/infix_parser_spec.rb +145 -0
  242. data/spec/acceptance/mixing_parsers_spec.rb +74 -0
  243. data/spec/acceptance/regression_spec.rb +329 -0
  244. data/spec/acceptance/repetition_and_maybe_spec.rb +44 -0
  245. data/spec/acceptance/unconsumed_input_spec.rb +21 -0
  246. data/spec/benchmark/comparative/runner_spec.rb +105 -0
  247. data/spec/integration/array_pooling_spec.rb +193 -0
  248. data/spec/integration/buffer_allocation_spec.rb +324 -0
  249. data/spec/integration/position_pooling_spec.rb +184 -0
  250. data/spec/integration/result_builder_spec.rb +282 -0
  251. data/spec/integration/rope_stringview_integration_spec.rb +188 -0
  252. data/spec/integration/slice_pooling_spec.rb +63 -0
  253. data/spec/integration/string_view_integration_spec.rb +125 -0
  254. data/spec/lexer_spec.rb +231 -0
  255. data/spec/parsanol/atom_results_spec.rb +39 -0
  256. data/spec/parsanol/atoms/alternative_spec.rb +26 -0
  257. data/spec/parsanol/atoms/base_spec.rb +127 -0
  258. data/spec/parsanol/atoms/capture_spec.rb +21 -0
  259. data/spec/parsanol/atoms/combinations_spec.rb +5 -0
  260. data/spec/parsanol/atoms/custom_spec.rb +79 -0
  261. data/spec/parsanol/atoms/dsl_spec.rb +7 -0
  262. data/spec/parsanol/atoms/entity_spec.rb +77 -0
  263. data/spec/parsanol/atoms/ignored_spec.rb +15 -0
  264. data/spec/parsanol/atoms/infix_spec.rb +5 -0
  265. data/spec/parsanol/atoms/lookahead_spec.rb +22 -0
  266. data/spec/parsanol/atoms/named_spec.rb +4 -0
  267. data/spec/parsanol/atoms/re_spec.rb +14 -0
  268. data/spec/parsanol/atoms/repetition_spec.rb +24 -0
  269. data/spec/parsanol/atoms/scope_spec.rb +26 -0
  270. data/spec/parsanol/atoms/sequence_spec.rb +28 -0
  271. data/spec/parsanol/atoms/str_spec.rb +15 -0
  272. data/spec/parsanol/atoms/visitor_spec.rb +101 -0
  273. data/spec/parsanol/atoms_spec.rb +488 -0
  274. data/spec/parsanol/auto_optimize_spec.rb +334 -0
  275. data/spec/parsanol/buffer_spec.rb +219 -0
  276. data/spec/parsanol/builder_callbacks_spec.rb +377 -0
  277. data/spec/parsanol/choice_optimizer_spec.rb +231 -0
  278. data/spec/parsanol/convenience_spec.rb +54 -0
  279. data/spec/parsanol/cut_inserter_spec.rb +248 -0
  280. data/spec/parsanol/cut_spec.rb +66 -0
  281. data/spec/parsanol/edit_tracker_spec.rb +218 -0
  282. data/spec/parsanol/error_reporter/contextual_spec.rb +122 -0
  283. data/spec/parsanol/error_reporter/deepest_spec.rb +82 -0
  284. data/spec/parsanol/error_reporter/tree_spec.rb +7 -0
  285. data/spec/parsanol/export_spec.rb +67 -0
  286. data/spec/parsanol/expression/treetop_spec.rb +75 -0
  287. data/spec/parsanol/first_set_spec.rb +298 -0
  288. data/spec/parsanol/interval_tree_spec.rb +205 -0
  289. data/spec/parsanol/lazy_result_spec.rb +288 -0
  290. data/spec/parsanol/lookahead_optimizer_spec.rb +252 -0
  291. data/spec/parsanol/minilisp.citrus +29 -0
  292. data/spec/parsanol/minilisp.tt +29 -0
  293. data/spec/parsanol/optimizer_spec.rb +459 -0
  294. data/spec/parsanol/options/parslet_compat_spec.rb +166 -0
  295. data/spec/parsanol/options/ruby_transform_spec.rb +70 -0
  296. data/spec/parsanol/options/serialized_spec.rb +69 -0
  297. data/spec/parsanol/options/zero_copy_spec.rb +230 -0
  298. data/spec/parsanol/parser_spec.rb +36 -0
  299. data/spec/parsanol/parslet_spec.rb +38 -0
  300. data/spec/parsanol/pattern_spec.rb +272 -0
  301. data/spec/parsanol/pool_spec.rb +392 -0
  302. data/spec/parsanol/pools/array_pool_spec.rb +356 -0
  303. data/spec/parsanol/pools/buffer_pool_spec.rb +365 -0
  304. data/spec/parsanol/pools/position_pool_spec.rb +118 -0
  305. data/spec/parsanol/pools/slice_pool_spec.rb +262 -0
  306. data/spec/parsanol/position_spec.rb +14 -0
  307. data/spec/parsanol/result_builder_spec.rb +391 -0
  308. data/spec/parsanol/rig/rspec_spec.rb +54 -0
  309. data/spec/parsanol/rope_spec.rb +207 -0
  310. data/spec/parsanol/scope_spec.rb +45 -0
  311. data/spec/parsanol/slice_spec.rb +249 -0
  312. data/spec/parsanol/source/line_cache_spec.rb +74 -0
  313. data/spec/parsanol/source_spec.rb +207 -0
  314. data/spec/parsanol/string_view_spec.rb +345 -0
  315. data/spec/parsanol/transform/context_spec.rb +56 -0
  316. data/spec/parsanol/transform_spec.rb +183 -0
  317. data/spec/parsanol/tree_memoization_spec.rb +149 -0
  318. data/spec/parslet_compatibility/expressir_edge_cases_spec.rb +153 -0
  319. data/spec/parslet_compatibility/minimal_reproduction.rb +199 -0
  320. data/spec/parslet_compatibility_spec.rb +399 -0
  321. data/spec/parslet_imported/atom_spec.rb +93 -0
  322. data/spec/parslet_imported/combinator_spec.rb +161 -0
  323. data/spec/parslet_imported/spec_helper.rb +73 -0
  324. data/spec/performance/batch_parsing_benchmark.rb +129 -0
  325. data/spec/performance/complete_optimization_summary.rb +143 -0
  326. data/spec/performance/grammar_caching_analysis.rb +121 -0
  327. data/spec/performance/grammar_caching_benchmark.rb +80 -0
  328. data/spec/performance/native_benchmark_spec.rb +230 -0
  329. data/spec/performance/phase5_benchmark.rb +144 -0
  330. data/spec/performance/profiling_benchmark.rb +131 -0
  331. data/spec/performance/ruby_improvements_benchmark.rb +171 -0
  332. data/spec/performance_spec.rb +374 -0
  333. data/spec/spec_helper.rb +79 -0
  334. data/spec/support/opal.rb +8 -0
  335. data/spec/support/opal.rb.erb +14 -0
  336. metadata +485 -0
@@ -0,0 +1,351 @@
1
+ module Parsanol::Atoms
2
+ # Helper class that implements a transient cache that maps position and
3
+ # parslet object to results. This is used for memoization in the packrat
4
+ # style.
5
+ #
6
+ # Also, error reporter is stored here and error reporting happens through
7
+ # this class. This makes the reporting pluggable.
8
+ #
9
+ class Context
10
+ # Per-parser adaptive-cache thresholds (Session 13).
11
+ # Keys are unqualified parser class names: #initialize looks a parser up by
12
+ # the last "::" segment of its class name and falls back to :default.
13
+ # Values are minimum input sizes: #try_with_cache skips memoization when the
14
+ # input is smaller than the selected value.
15
+ # Authors' profiling notes: JSON regressed on medium (5KB) input at 1000; ERB benefits early; Calc and Sentence need larger input.
16
+ PARSER_CACHE_THRESHOLDS = {
17
+ 'JsonParser' => 10_000, # High threshold - json/medium regressed at 1000
18
+ 'ErbParser' => 800, # Moderate - working well
19
+ 'CalcParser' => 2000, # Low repetition
20
+ 'SentenceParser' => 5000, # Linear grammar
21
+ :default => 1000
22
+ }.freeze
23
+
24
# Prepares a parse context: memoization table, error reporter, capture
# scope, object pools and the adaptive-caching configuration.
#
# @param reporter [#err, #err_at] error reporter; a new
#   Parsanol::ErrorReporter::Tree is created when the argument is omitted
# @param interval_cache [Boolean] memoize through per-parslet interval trees
#   (GPeg style) instead of the position-keyed hash
# @param adaptive_cache_threshold [Integer, nil] inputs smaller than this
#   skip memoization; nil selects a per-parser or default value
# @param parser_class [Class, nil] parser class whose unqualified name picks
#   an entry of PARSER_CACHE_THRESHOLDS
def initialize(reporter=Parsanol::ErrorReporter::Tree.new, interval_cache: false, adaptive_cache_threshold: nil, parser_class: nil)
  # Memo table: position => { parslet object_id => [result, advance] }.
  @cache = Hash.new { |table, pos| table[pos] = {} }
  @reporter = reporter
  @captures = Parsanol::Scope.new

  # Bookkeeping for the periodic eviction performed in #try_with_cache.
  @max_position = 0          # furthest position seen so far
  @eviction_threshold = 200  # entries further behind than this are dropped
  @eviction_counter = 0      # position advances since the last eviction
  @eviction_frequency = 100  # run an eviction once per this many advances

  # Object pools (Phase 1.3 / 2.1) exposed through #acquire_array,
  # #release_array, #acquire_buffer and #release_buffer.
  @array_pool = Parsanol::Pools::ArrayPool.new(size: 10000)
  @buffer_pool = Parsanol::Pools::BufferPool.new(pool_size: 100)

  # Selective memoization: per-parslet hit/miss counters. A result is stored
  # while the parslet has at most @cache_threshold recorded attempts, or as
  # soon as it has produced at least one hit.
  @hit_counts = Hash.new(0)
  @miss_counts = Hash.new(0)
  @cache_threshold = 2

  # Optional interval-tree cache; its dependencies are loaded lazily so the
  # default configuration does not require them.
  @use_interval_cache = interval_cache
  if @use_interval_cache
    require 'parsanol/interval_tree'
    require 'parsanol/edit_tracker'
    # One interval tree per parslet object_id.
    @interval_cache = Hash.new { |trees, id| trees[id] = Parsanol::IntervalTree.new }
    @edit_tracker = Parsanol::EditTracker.new
  end

  # Position of the most recent cut (see #cut!).
  @last_cut_position = 0

  # Threshold precedence: explicit argument > per-parser entry > default.
  fallback = PARSER_CACHE_THRESHOLDS[:default]
  chosen = adaptive_cache_threshold
  if chosen.nil? && parser_class
    # Only the last "::" segment is used, e.g. "MyJson::Parser" -> "Parser".
    short_name = parser_class.name&.split('::')&.last
    chosen = PARSER_CACHE_THRESHOLDS[short_name] || fallback
  end
  @adaptive_cache_threshold = chosen || fallback

  # Both are filled in by the first cacheable call to #try_with_cache.
  @input_size = nil
  @caching_enabled = nil
end
85
+
86
# Applies +obj+ at the current position of +source+, memoizing the outcome
# packrat-style so that a repeated application at the same position replays
# the stored result and re-advances the source by the same amount.
#
# Memoization is bypassed when the atom reports it is not cacheable, or when
# the input is smaller than the adaptive threshold. When interval caching is
# enabled the call is handed to #try_with_interval_cache.
#
# @param obj [#cached?, #try] the parslet to apply
# @param source [#bytepos, #bytepos=, #chars_left] the input being parsed
# @param consume_all [Boolean] forwarded unchanged to obj.try
# @return [Array] the result returned by obj.try (fresh or memoized)
def try_with_cache(obj, source, consume_all)
  # Atoms that opt out of memoization are applied directly.
  return obj.try(source, self, consume_all) unless obj.cached?

  # Decide once, on the first cacheable call, whether the input is large
  # enough to be worth memoizing.
  # NOTE(review): this adds a byte offset to a remaining-character count;
  # for multibyte input the sum may differ from the byte size - confirm.
  if @caching_enabled.nil?
    @input_size = source.bytepos + source.chars_left
    @caching_enabled = @input_size >= @adaptive_cache_threshold
  end
  return obj.try(source, self, consume_all) unless @caching_enabled

  return try_with_interval_cache(obj, source, consume_all) if @use_interval_cache

  # Hot path: keep frequently used ivars in locals.
  memo = @cache
  hits = @hit_counts
  misses = @miss_counts

  start = source.bytepos
  atom_id = obj.object_id

  # Each time the parse reaches a new furthest position, count it; every
  # @eviction_frequency advances, drop entries lying more than
  # @eviction_threshold behind the current position to bound memory.
  if start > @max_position
    @max_position = start
    @eviction_counter += 1
    if @eviction_counter >= @eviction_frequency
      @eviction_counter = 0
      oldest_kept = start - @eviction_threshold
      memo.delete_if { |pos, _| pos < oldest_kept }
    end
  end

  # Hit: replay the stored result and consume the same number of bytes.
  slot = memo[start]
  if slot.key?(atom_id)
    hits[atom_id] += 1
    outcome, consumed = slot[atom_id]
    source.bytepos = start + consumed
    return outcome
  end

  # Miss: run the parslet and measure how far it advanced.
  misses[atom_id] += 1
  outcome = obj.try(source, self, consume_all)
  consumed = source.bytepos - start

  # Selective memoization: store while still learning about this parslet
  # (few attempts so far) or once it has shown at least one hit. The slot is
  # looked up again because obj.try may have evicted it in the meantime.
  if hits[atom_id] > 0 || hits[atom_id] + misses[atom_id] <= @cache_threshold
    memo[start][atom_id] = [outcome, consumed]
  end

  outcome
end
171
+
172
# Interval-tree flavour of #try_with_cache (GPeg style): each parslet owns
# an interval tree in which results are stored under the span they consumed.
#
# @param obj [#try] the parslet to apply
# @param source [#bytepos, #bytepos=] the input being parsed
# @param consume_all [Boolean] forwarded unchanged to obj.try
# @return [Array] the result returned by obj.try (fresh or memoized)
def try_with_interval_cache(obj, source, consume_all)
  start = source.bytepos
  atom_id = obj.object_id
  tree = @interval_cache[atom_id]

  # NOTE(review): entries are inserted below as [start, stop) but looked up
  # with query_exact(start, start); whether spans with stop > start can ever
  # match depends on IntervalTree#query_exact - confirm.
  memoized = tree.query_exact(start, start)
  if memoized
    # Hit: replay the stored result and consume the same number of bytes.
    @hit_counts[atom_id] += 1
    outcome, consumed = memoized
    source.bytepos = start + consumed
    return outcome
  end

  # Miss: run the parslet and record the span it covered.
  @miss_counts[atom_id] += 1
  outcome = obj.try(source, self, consume_all)
  stop = source.bytepos
  consumed = stop - start

  # Selective memoization, same rule as the hash-based cache: store while
  # the parslet has few recorded attempts or has already produced a hit.
  attempts = @hit_counts[atom_id] + @miss_counts[atom_id]
  if @hit_counts[atom_id] > 0 || attempts <= @cache_threshold
    tree.insert(start, stop, [outcome, consumed])
  end

  outcome
end
206
+
207
# Shared, frozen result tuples returned when there is nothing to report, so
# that these frequent outcomes do not allocate a new array on every call.
SUCCESS_NIL = [true, nil].freeze
ERROR_NIL = [false, nil].freeze

# Reports an error at a given position through the configured reporter.
# @see ErrorReporter
#
# @param args forwarded unchanged to reporter.err_at
# @return [Array] [false, cause], or ERROR_NIL when no reporter is set
def err_at(*args)
  @reporter ? [false, @reporter.err_at(*args)] : ERROR_NIL
end

# Reports an error through the configured reporter.
# @see ErrorReporter
#
# @param args forwarded unchanged to reporter.err
# @return [Array] [false, cause], or ERROR_NIL when no reporter is set
def err(*args)
  @reporter ? [false, @reporter.err(*args)] : ERROR_NIL
end

# Reports a successful parse through the configured reporter.
# @see ErrorReporter::Contextual
#
# @param args forwarded unchanged to reporter.succ
# @return [Array] [true, value]; SUCCESS_NIL when no reporter is set or the
#   reporter returns nil
def succ(*args)
  return SUCCESS_NIL unless @reporter

  note = @reporter.succ(*args)
  note.nil? ? SUCCESS_NIL : [true, note]
end
239
+
240
+ # Returns the current captures made on the input (see
241
+ # Parsanol::Atoms::Base#capture). Use as follows:
242
+ #
243
+ # context.captures[:foobar] # => returns capture :foobar
244
+ #
245
+ attr_reader :captures
246
+
247
+ # Phase 1.3: Expose ArrayPool for array acquisition/release
248
+ # @return [Parsanol::Pools::ArrayPool] The array pool instance
249
+ attr_reader :array_pool
250
+
251
+ # Acquires an array by delegating to #acquire on this context's ArrayPool.
252
+ # The pool is expected to hand back an empty array (assumed - see ArrayPool).
253
+ #
254
+ # @return [Array] whatever @array_pool.acquire returns
255
+ def acquire_array
256
+ @array_pool.acquire
257
+ end
258
+
259
+ # Hands an array back by delegating to #release on this context's ArrayPool.
260
+ # Clearing and reuse are the pool's responsibility (assumed - see ArrayPool).
261
+ #
262
+ # @param array [Array] The array to return to the pool
263
+ # @return whatever @array_pool.release returns (presumably true if pooled, false if discarded)
264
+ def release_array(array)
265
+ @array_pool.release(array)
266
+ end
267
+
268
+ # Phase 2.1: Expose BufferPool for buffer acquisition/release
269
+ # @return [Parsanol::Pools::BufferPool] The buffer pool instance
270
+ attr_reader :buffer_pool
271
+
272
+ # Acquires a buffer by delegating to #acquire(size:) on this context's BufferPool.
273
+ #
274
+ # @param size [Integer] Minimum required capacity, passed through as size:
275
+ # @return whatever @buffer_pool.acquire returns (presumably a Parsanol::Buffer with capacity >= size)
276
+ def acquire_buffer(size:)
277
+ @buffer_pool.acquire(size: size)
278
+ end
279
+
280
+ # Hands a buffer back by delegating to #release on this context's BufferPool.
281
+ # Clearing and reuse are the pool's responsibility (assumed - see BufferPool).
282
+ #
283
+ # @param buffer [Parsanol::Buffer] The buffer to return to the pool
284
+ # @return whatever @buffer_pool.release returns (presumably true if pooled, false if discarded)
285
+ def release_buffer(buffer)
286
+ @buffer_pool.release(buffer)
287
+ end
288
+
289
+ # Runs the given block inside a new capture scope: pushes onto #captures, yields, and pops
290
+ # in an ensure clause so the pop also runs if the block raises. Use Parsanol::Atoms::DSL#scope to call this.
291
+ #
292
+ def scope
293
+ captures.push
294
+ yield
295
+ ensure
296
+ captures.pop
297
+ end
298
+
299
+ # GPeg-style tree memoization support.
300
+ # Returns the interval_cache flag given to #initialize; tree memoization is active when it is truthy.
301
+ def use_tree_memoization?
302
+ @use_interval_cache
303
+ end
304
+
305
# Looks up the tree-memoization entry for +cache_key+ that begins exactly at
# +start_pos+.
#
# @param cache_key the key selecting the interval tree
# @param start_pos [Integer] position the memoized interval must start at
# @return the data stored with the first matching interval ([values, end_pos]
#   when written by #store_tree_memo); nil when interval caching is off or
#   nothing starts at start_pos
def query_tree_memo(cache_key, start_pos)
  return nil unless @use_interval_cache

  # Ask the tree for everything overlapping [start_pos, start_pos + 1), then
  # keep the first interval whose start is exactly start_pos.
  candidates = @interval_cache[cache_key].query_overlapping(start_pos, start_pos + 1)
  match = candidates.find { |span, _payload| span[0] == start_pos }
  match && match[1]
end
317
+
318
# Records a tree-memoization entry: the values collected for a repetition
# are stored under the interval [start_pos, end_pos) of the tree selected by
# +cache_key+. Does nothing when interval caching is disabled.
#
# @param cache_key the key selecting the interval tree
# @param start_pos [Integer] start of the memoized interval
# @param values the values to remember
# @param end_pos [Integer] end of the memoized interval
# @return nil when interval caching is off, otherwise the tree's insert result
def store_tree_memo(cache_key, start_pos, values, end_pos)
  return unless @use_interval_cache

  # The end position is kept in the payload as well, so #query_tree_memo
  # callers get it back without inspecting the interval.
  entry = [values, end_pos]
  @interval_cache[cache_key].insert(start_pos, end_pos, entry)
end
324
+
325
# Cut operator support (Phase 46b).
# Invoked when a cut succeeds: the parser commits to not backtracking before
# +position+, so memo entries keyed below it are discarded right away.
#
# @param position [Integer] The position where the cut occurred
# @return [Hash] the position-keyed memo table after eviction
def cut!(position)
  @last_cut_position = position

  # Only the position-keyed hash cache is pruned here; entries at or after
  # the cut survive.
  @cache.keep_if { |pos, _| pos >= position }
end
338
+
339
+ private
340
+ # NOTE #lookup and #set key the memo table by position, then by the parslet's #object_id;
341
+ # object_id is used directly since that seems to bring the most performance benefit (hot spot),
342
+ # more than going through Atoms::Base#hash. No caller is visible in this file - confirm still used.
343
+ #
344
+ def lookup(obj, pos)
345
+ @cache[pos][obj.object_id]
346
+ end
347
+ def set(obj, pos, val)
348
+ @cache[pos][obj.object_id] = val
349
+ end
350
+ end
351
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Experimental: Position-based cache eviction for Context
4
+ # Based on PEG theory: in linear parsing, positions behind current position
5
+ # will never be revisited, so we can evict them to reduce memory
6
+
7
+ module Parsanol
8
+ module Atoms
9
+ class Context
10
+ # Add position tracking for cache eviction
11
+ attr_reader :current_position
12
+
13
# Experimental position-windowed variant of Context#try_with_cache.
#
# Results are memoized per atom and per source position. Once an atom's
# table holds more than 100 entries, those more than 50 positions behind the
# current one are dropped.
#
# Each entry stores the result together with the number of bytes the atom
# consumed. On a hit the source is advanced by that amount, because
# replaying a memoized parse must leave the source exactly where the
# original parse left it; returning the result alone would make the caller
# continue from the pre-match position.
#
# NOTE(review): @cache is indexed by atom here, whereas #cut! and the
# #lookup/#set helpers treat its keys as positions - confirm the two are not
# meant to be loaded together.
#
# @param obj [#cached?, #try] the parslet to apply
# @param source [#pos, #bytepos, #bytepos=] the input being parsed
# @param consume_all [Boolean] forwarded unchanged to obj.try
# @return [Array] the result returned by obj.try (fresh or memoized)
def try_with_cache(obj, source, consume_all)
  # Atoms that opt out of memoization are applied directly.
  return obj.try(source, self, consume_all) unless obj.cached?

  key = source.pos
  @current_position = key
  start_byte = source.bytepos
  atom_cache = @cache[obj]

  # Cache hit: replay the result and re-consume the same number of bytes.
  if atom_cache.key?(key)
    result, advance = atom_cache.fetch(key)
    source.bytepos = start_byte + advance
    return result
  end

  # Cache miss: compute the result and remember how far the source moved.
  result = obj.try(source, self, consume_all)
  atom_cache[key] = [result, source.bytepos - start_byte]

  # Evict old positions if this atom's table is getting large, keeping only
  # a window of 50 positions behind the current one.
  if atom_cache.size > 100
    min_pos = key - 50
    atom_cache.delete_if { |pos, _| pos < min_pos }
  end

  result
end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
module Parsanol
  module Atoms
    # Base class for creating custom parser atoms.
    #
    # Custom atoms allow extending Parsanol with domain-specific matching logic
    # that cannot be expressed with the built-in combinators. Subclasses
    # implement #try_match; #try wraps it and reports failures through the
    # parse context.
    #
    # @example Custom atom for matching indentation-sensitive content
    #   class IndentAtom < Parsanol::Atoms::Custom
    #     def initialize(expected_indent)
    #       @expected_indent = expected_indent
    #       super()
    #     end
    #
    #     # Required: Implement try_match
    #     def try_match(source, context, consume_all)
    #       start = source.bytepos
    #       indent = count_indent(source)
    #
    #       if indent == @expected_indent
    #         content = read_until_newline(source)
    #         [true, content]
    #       else
    #         source.bytepos = start # Restore position on failure
    #         [false, nil]
    #       end
    #     end
    #
    #     # Indentation depends on surrounding state, so opt out of caching
    #     def cacheable?
    #       false
    #     end
    #
    #     private
    #
    #     def count_indent(source)
    #       # ... implementation ...
    #     end
    #   end
    #
    #   # Usage in parser
    #   class MyParser < Parsanol::Parser
    #     rule(:indented_line) { IndentAtom.new(2) }
    #   end
    #
    class Custom < Base
      # Required: Implement this method to define matching behavior
      #
      # @param source [Parsanol::Source] The input source with position tracking
      # @param context [Parsanol::Atoms::Context] Parse context for memoization
      # @param consume_all [Boolean] If true, must consume entire input
      # @return [Array<Boolean, Object>] Tuple of [success, result]
      #   - success: true if match succeeded, false otherwise
      #   - result: matched value on success, nil on failure
      # @raise [NotImplementedError] always, unless overridden by a subclass
      #
      # @note You MUST restore source.bytepos on failure for proper backtracking
      #   (NOTE(review): assumes the source exposes a bytepos writer - confirm
      #   against Parsanol::Source)
      #
      def try_match(source, context, consume_all)
        raise NotImplementedError,
              "Custom atoms must implement #try_match(source, context, consume_all)"
      end

      # Override of Base#try that delegates to try_match
      # Handles error reporting and result wrapping: a truthy success flag
      # yields [true, result]; otherwise the value of context.err is returned.
      #
      # @api private
      def try(source, context, consume_all)
        success, result = try_match(source, context, consume_all)
        return [true, result] if success

        # Generate error cause for reporting
        context.err(
          self,
          source,
          "Failed to match custom atom: #{self.class.name}"
        )
      end

      # Optional: Override to provide first set for optimization
      # Returns the set of characters/strings this atom can match at start
      #
      # @return [Set<String>, nil] First set, or nil if not determinable
      def first_set
        nil # Unknown by default
      end

      # Optional: Override to control caching for this atom
      # Return false for context-dependent matching (e.g., indentation)
      #
      # @return [Boolean] true if atom can be cached
      def cacheable?
        true
      end

      # Caching predicate queried by the parse context (obj.cached?).
      # Delegates to #cacheable? so that overriding the documented hook
      # actually takes effect.
      #
      # @return [Boolean] the value of #cacheable?
      # @api private
      def cached?
        cacheable?
      end

      # Optional: Override to provide custom serialization for native parser
      # Return nil if atom cannot be serialized (must use pure Ruby mode)
      #
      # @return [Hash, nil] JSON-serializable representation
      def to_native_format
        nil # Not serializable by default
      end

      # Override to_s_inner for debug printing; the precedence argument is
      # accepted for interface compatibility but not used.
      # @api private
      def to_s_inner(prec = nil)
        "custom(#{self.class.name})"
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # Cut operator for PEG grammars
2
+ #
3
+ # A cut operator (↑) instructs the parser to discard backtrack information
4
+ # at a specific point. This enables more aggressive cache eviction and can
5
+ # reduce space complexity from O(n) to O(1).
6
+ #
7
+ # Reference: Mizushima et al. (2010) "Packrat Parsers Can Handle Practical
8
+ # Grammars in Mostly Constant Space"
9
+ #
10
+ # Example:
11
+ #
12
+ # rule(:statement) {
13
+ # str('if').cut >> condition >> then_clause |
14
+ # str('while').cut >> condition >> body |
15
+ # str('print').cut >> expression
16
+ # }
17
+ #
18
+ # After 'if' succeeds, the cut discards backtrack info for 'while' and 'print'.
19
+ # This means if the parse fails later in the 'if' branch, we won't try the
20
+ # other alternatives.
21
+ #
22
class Parsanol::Atoms::Cut < Parsanol::Atoms::Base
  # The wrapped atom whose success triggers the cut.
  attr_reader :parslet

  # @param parslet [Parsanol::Atoms::Base] the atom to wrap
  def initialize(parslet)
    super()
    @parslet = parslet
  end

  # Applies the wrapped atom. When it succeeds and the context understands
  # #cut!, the context is told about the cut at the current byte position.
  # A failed match is passed through untouched.
  #
  # @return [Array] the [success, value] pair produced by the wrapped atom
  def try(source, context, consume_all)
    success, value = parslet.apply(source, context, consume_all)

    # Only a successful match is a commit point; contexts without #cut! are
    # simply left alone.
    context.cut!(source.bytepos) if success && context.respond_to?(:cut!)

    [success, value]
  end

  # The wrapper itself is never memoized.
  def cached?
    false
  end

  # Renders the wrapped atom followed by the cut marker.
  def to_s_inner(prec)
    "#{parslet.to_s(prec)}↑"
  end

  # The FIRST set is taken straight from the wrapped atom.
  def compute_first_set
    parslet.first_set
  end
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+
4
+ # A mixin module that defines operations that can be called on any subclass
5
+ # of Parsanol::Atoms::Base. These operations make parslets atoms chainable and
6
+ # allow combination of parslet atoms to form bigger parsers.
7
+ #
8
+ # Example:
9
+ #
10
+ # str('foo') >> str('bar')
11
+ # str('f').repeat
12
+ # any.absent? # also called The Epsilon
13
+ #
14
module Parsanol::Atoms::DSL
  # Builds a repetition of the receiver: at least +min+ and at most +max+
  # occurrences. A nil +max+ means there is no upper bound.
  #
  # Example:
  #   str('a').repeat        # any number of 'a's
  #   str('a').repeat(1,3)   # between 1 and 3 'a's
  #
  def repeat(min = 0, max = nil)
    Parsanol::Atoms::Repetition.new(self, min, max)
  end

  # Makes the receiver optional. Built as a repetition with bounds 0 and 1,
  # tagged :maybe. The tree value is nil when the atom is absent from the
  # input, otherwise the matched subtree.
  #
  # Example:
  #   str('foo').maybe
  #
  def maybe
    Parsanol::Atoms::Repetition.new(self, 0, 1, :maybe)
  end

  # Wraps the receiver so that it does not show up in the output; the
  # generated tree value will always be nil.
  #
  # Example:
  #   str('foo').ignore
  #
  def ignore
    Parsanol::Atoms::Ignored.new(self)
  end

  # Sequence: the receiver followed by +parslet+.
  #
  # Example:
  #   str('a') >> str('b')
  #
  def >>(parslet)
    Parsanol::Atoms::Sequence.new(self, parslet)
  end

  # Alternation: the left-hand side is always attempted first; the
  # right-hand side is tried only if the left one does not match.
  #
  # Example:
  #   str('a') | str('b')   # matches either 'a' OR 'b'
  #
  def |(parslet)
    Parsanol::Atoms::Alternative.new(self, parslet)
  end

  # Negative lookahead: tests that the receiver is absent from the input
  # stream without consuming it.
  #
  # Example:
  #   str('a').absent?   # only proceed if 'a' is absent
  #
  def absent?
    Parsanol::Atoms::Lookahead.new(self, false)
  end

  # Positive lookahead: tests that the receiver is present in the input
  # stream without consuming it.
  #
  # Example:
  #   str('a').present?   # only proceed if 'a' is present
  #
  def present?
    Parsanol::Atoms::Lookahead.new(self, true)
  end

  # Names the receiver's result so it appears in the tree output. Needed to
  # get meaningful output from the #parse method.
  #
  # Example:
  #   str('a').as(:b)   # will produce {:b => 'a'}
  #
  def as(name)
    Parsanol::Atoms::Named.new(self, name)
  end

  # Stores the result of a successful parse of the receiver (which may be
  # complex) under +name+. Useful for self-referential parses.
  #
  # Example:
  #   str('a').capture(:b)   # will store captures[:b] == 'a'
  #
  def capture(name)
    Parsanol::Atoms::Capture.new(self, name)
  end

  # Wraps the receiver in a cut point. Once the receiver succeeds, the
  # parser discards backtrack information, enabling O(1) space complexity.
  # Use with caution: cuts prevent backtracking to alternative branches.
  #
  # Example:
  #   str('if').cut >> condition >> then_clause |
  #   str('while') >> condition >> body
  #
  # If 'if' matches, the first branch is committed to; a later failure in
  # condition or then_clause will not fall back to the 'while' alternative.
  #
  def cut
    Parsanol::Atoms::Cut.new(self)
  end
end