hron 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hron/lexer.rb ADDED
@@ -0,0 +1,253 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "ast"
4
+ require_relative "error"
5
+
6
+ module Hron
7
+ # Token kinds (using symbols and Data classes)
8
+ module TokenKind
9
+ EVERY = :every
10
+ ON = :on
11
+ AT = :at
12
+ FROM = :from
13
+ TO = :to
14
+ IN = :in
15
+ OF = :of
16
+ THE = :the
17
+ LAST = :last
18
+ EXCEPT = :except
19
+ UNTIL = :until
20
+ STARTING = :starting
21
+ DURING = :during
22
+ YEAR = :year
23
+ DAY = :day
24
+ WEEKDAY_KW = :weekday_kw
25
+ WEEKEND_KW = :weekend_kw
26
+ WEEKS = :weeks
27
+ MONTH = :month
28
+ COMMA = :comma
29
+ end
30
+
31
# Value-carrying token kinds. Each is an immutable Data class; instances are
# used as the +kind+ of a Token in place of a TokenKind symbol.

# A day name such as "monday" or "mon"; +name+ is a Weekday constant.
TDayName = Data.define(:name)

# A month name such as "january" or "jan"; +name+ is a MonthName constant.
TMonthName = Data.define(:name)

# A spelled-out ordinal ("first" .. "fifth"); +position+ is an
# OrdinalPosition constant.
TOrdinal = Data.define(:position)

# An interval unit word ("min", "hours", ...); +unit+ is an IntervalUnit
# constant.
TIntervalUnit = Data.define(:unit)

# A bare run of digits; +value+ is the Integer it spells.
TNumber = Data.define(:value)

# A number followed by st/nd/rd/th ("1st", "22nd"); +value+ is the Integer
# part without the suffix.
TOrdinalNumber = Data.define(:value)

# A clock time written H:MM or HH:MM; the lexer only emits it with
# +hour+ <= 23 and +minute+ <= 59.
TTime = Data.define(:hour, :minute)

# A date written YYYY-MM-DD; +date+ is the raw 10-character text. The lexer
# checks the layout only, not that the date exists.
TIsoDate = Data.define(:date)

# The text following the keyword "in", up to the next whitespace; +tz+ is
# stored verbatim.
TTimezone = Data.define(:tz)

# One lexed token. +kind+ is either a TokenKind symbol or one of the value
# classes above; +span+ is the Span.new(start, stop) of the token in the
# input, where +stop+ is the offset just past its last character.
Token = Data.define(:kind, :span)
44
+
45
# Lookup table from a lower-cased word to the token kind it produces: a
# TokenKind symbol for plain keywords, or a value-carrying token (TDayName,
# TMonthName, TOrdinal, TIntervalUnit) for words with a payload.
#
# Each row pairs one kind with every accepted spelling of it; the rows are
# expanded into a flat, frozen Hash keyed by spelling.
KEYWORD_MAP = [
  # Clause and connective keywords
  [TokenKind::EVERY, %w[every]],
  [TokenKind::ON, %w[on]],
  [TokenKind::AT, %w[at]],
  [TokenKind::FROM, %w[from]],
  [TokenKind::TO, %w[to]],
  [TokenKind::IN, %w[in]],
  [TokenKind::OF, %w[of]],
  [TokenKind::THE, %w[the]],
  [TokenKind::LAST, %w[last]],
  [TokenKind::EXCEPT, %w[except]],
  [TokenKind::UNTIL, %w[until]],
  [TokenKind::STARTING, %w[starting]],
  [TokenKind::DURING, %w[during]],
  # Calendar-unit keywords (singular and plural)
  [TokenKind::YEAR, %w[year years]],
  [TokenKind::DAY, %w[day days]],
  [TokenKind::WEEKDAY_KW, %w[weekday weekdays]],
  [TokenKind::WEEKEND_KW, %w[weekend weekends]],
  [TokenKind::WEEKS, %w[weeks week]],
  [TokenKind::MONTH, %w[month months]],
  # Day names
  [TDayName.new(Weekday::MONDAY), %w[monday mon]],
  [TDayName.new(Weekday::TUESDAY), %w[tuesday tue]],
  [TDayName.new(Weekday::WEDNESDAY), %w[wednesday wed]],
  [TDayName.new(Weekday::THURSDAY), %w[thursday thu]],
  [TDayName.new(Weekday::FRIDAY), %w[friday fri]],
  [TDayName.new(Weekday::SATURDAY), %w[saturday sat]],
  [TDayName.new(Weekday::SUNDAY), %w[sunday sun]],
  # Month names ("may" has no separate abbreviation)
  [TMonthName.new(MonthName::JAN), %w[january jan]],
  [TMonthName.new(MonthName::FEB), %w[february feb]],
  [TMonthName.new(MonthName::MAR), %w[march mar]],
  [TMonthName.new(MonthName::APR), %w[april apr]],
  [TMonthName.new(MonthName::MAY), %w[may]],
  [TMonthName.new(MonthName::JUN), %w[june jun]],
  [TMonthName.new(MonthName::JUL), %w[july jul]],
  [TMonthName.new(MonthName::AUG), %w[august aug]],
  [TMonthName.new(MonthName::SEP), %w[september sep]],
  [TMonthName.new(MonthName::OCT), %w[october oct]],
  [TMonthName.new(MonthName::NOV), %w[november nov]],
  [TMonthName.new(MonthName::DEC), %w[december dec]],
  # Spelled-out ordinals
  [TOrdinal.new(OrdinalPosition::FIRST), %w[first]],
  [TOrdinal.new(OrdinalPosition::SECOND), %w[second]],
  [TOrdinal.new(OrdinalPosition::THIRD), %w[third]],
  [TOrdinal.new(OrdinalPosition::FOURTH), %w[fourth]],
  [TOrdinal.new(OrdinalPosition::FIFTH), %w[fifth]],
  # Interval units
  [TIntervalUnit.new(IntervalUnit::MIN), %w[min mins minute minutes]],
  [TIntervalUnit.new(IntervalUnit::HOURS), %w[hour hours hr hrs]]
].flat_map { |kind, spellings| spellings.map { |spelling| [spelling, kind] } }.to_h.freeze
127
+
128
# Hand-written scanner that turns an expression string into an array of Token.
#
# The scanner walks +@input+ left to right, tracking its offset in +@pos+.
# Whitespace separates tokens and is never emitted. Seeing the keyword "in"
# switches the scanner into timezone mode for exactly one token, so the text
# that follows is taken verbatim instead of being looked up as a keyword.
class Lexer
  # Two-letter suffixes that mark a number as an ordinal ("1st", "22nd").
  # Any suffix is accepted after any number; agreement is not checked.
  ORDINAL_SUFFIXES = %w[st nd rd th].freeze

  # Layout of an ISO date: YYYY-MM-DD. Only the shape is checked, not whether
  # the month and day are in range.
  ISO_DATE = /\A\d{4}-\d{2}-\d{2}\z/

  private_constant :ORDINAL_SUFFIXES, :ISO_DATE

  # @param input [String] expression text to scan
  def initialize(input)
    @input = input
    @pos = 0
    @after_in = false
  end

  # Scans from the current position to the end of the input.
  #
  # @return [Array<Token>] tokens in source order; empty when only whitespace
  #   remains
  # @raise the error built by HronError.lex on an unexpected character, an
  #   unknown word, or a malformed or out-of-range time
  def tokenize
    tokens = []
    loop do
      skip_whitespace
      break if @pos >= @input.length

      tokens << next_token
    end
    tokens
  end

  private

  # Lexes the single token starting at @pos. The caller guarantees @pos is
  # inside the input and points at a non-whitespace character.
  def next_token
    if @after_in
      # Timezone mode lasts for one token only.
      @after_in = false
      return lex_timezone
    end

    start = @pos
    ch = @input[@pos]

    case ch
    when ","
      @pos += 1
      Token.new(TokenKind::COMMA, Span.new(start, @pos))
    when /\d/
      lex_number_or_time_or_date
    when /[a-zA-Z]/
      lex_word
    else
      raise HronError.lex("unexpected character '#{ch}'", Span.new(start, start + 1), @input)
    end
  end

  # Advances @pos past any run of whitespace.
  def skip_whitespace
    @pos += 1 while @pos < @input.length && @input[@pos].match?(/\s/)
  end

  # Consumes a run of digits starting at @pos and returns it (possibly "").
  def scan_digits
    start = @pos
    @pos += 1 while @pos < @input.length && @input[@pos].match?(/\d/)
    @input[start...@pos]
  end

  # Takes everything up to the next whitespace as a timezone name, verbatim.
  def lex_timezone
    skip_whitespace
    start = @pos
    @pos += 1 while @pos < @input.length && !@input[@pos].match?(/\s/)
    tz = @input[start...@pos]
    # tokenize only gets here with a non-whitespace character at @pos, so
    # this guard is purely defensive.
    raise HronError.lex("expected timezone after 'in'", Span.new(start, start + 1), @input) if tz.empty?

    Token.new(TTimezone.new(tz), Span.new(start, @pos))
  end

  # Lexes a token that begins with a digit. Tries, in order: an ISO date
  # (YYYY-MM-DD), a clock time (H:MM or HH:MM), then a plain or ordinal number.
  def lex_number_or_time_or_date
    start = @pos
    digits = scan_digits

    lex_iso_date(start, digits) || lex_time(start, digits) || lex_number(start, digits)
  end

  # Returns a TIsoDate token when the text at +start+ has the YYYY-MM-DD
  # layout, otherwise nil with @pos left just after +digits+.
  def lex_iso_date(start, digits)
    return unless digits.length == 4 && @input[@pos] == "-"
    return unless @input[start, 10].match?(ISO_DATE)

    @pos = start + 10
    Token.new(TIsoDate.new(@input[start...@pos]), Span.new(start, @pos))
  end

  # Returns a TTime token when +digits+ (one or two of them) is followed by
  # ':'; returns nil, consuming nothing, when there is no ':'.
  #
  # Once the ':' has been seen the text can only be a time, so anything other
  # than exactly two minute digits is an error. Previously such input
  # ("9:5", "9:") fell through and was returned as a plain number whose span
  # silently swallowed the colon and the stray digits.
  def lex_time(start, digits)
    return unless digits.length.between?(1, 2) && @input[@pos] == ":"

    @pos += 1 # consume ':'
    minute_digits = scan_digits
    hour = digits.to_i
    minute = minute_digits.to_i
    well_formed = minute_digits.length == 2 && hour <= 23 && minute <= 59
    raise HronError.lex("invalid time", Span.new(start, @pos), @input) unless well_formed

    Token.new(TTime.new(hour, minute), Span.new(start, @pos))
  end

  # Returns a TOrdinalNumber when +digits+ is directly followed by
  # st/nd/rd/th (any letter case), otherwise a TNumber.
  def lex_number(start, digits)
    value = digits.to_i
    suffix = @input[@pos, 2].to_s.downcase

    if ORDINAL_SUFFIXES.include?(suffix)
      @pos += 2
      return Token.new(TOrdinalNumber.new(value), Span.new(start, @pos))
    end

    Token.new(TNumber.new(value), Span.new(start, @pos))
  end

  # Lexes a run of word characters and looks it up, lower-cased, in
  # KEYWORD_MAP. The keyword "in" arms timezone mode for the next token.
  def lex_word
    start = @pos
    @pos += 1 while @pos < @input.length && @input[@pos].match?(/\w/)
    word = @input[start...@pos].downcase
    span = Span.new(start, @pos)

    kind = KEYWORD_MAP[word]
    raise HronError.lex("unknown keyword '#{word}'", span, @input) if kind.nil?

    @after_in = true if kind == TokenKind::IN

    Token.new(kind, span)
  end
end
249
+
250
# Convenience entry point: builds a fresh Lexer for +input+ and runs it.
#
# @param input the text handed to Lexer.new
# @return whatever Lexer#tokenize returns for that input
def self.tokenize(input)
  lexer = Lexer.new(input)
  lexer.tokenize
end
253
+ end