gmail_search_syntax 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gmail_search_syntax/tokenizer.rb +8 -1
- data/lib/gmail_search_syntax/version.rb +1 -1
- data/slop/EMBEDDED_HYPHENS.md +102 -0
- data/test/gmail_search_syntax_test.rb +85 -0
- data/test/tokenizer_test.rb +58 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47b56ee5467d6808c4ceae00289bf291551374a3e5854ed5223a5b2f5ca2f9ac
|
|
4
|
+
data.tar.gz: 27aa483d8296eb2a3e775aecfaeffc0813f60936cd9091e976eb5159559cafc9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0170d5419e8ab3335a3bd4494f09c3b6e6f5a143cf26791a383f2189a9e12fcd49a596e56555670d018a6ffdffa13b13eed27e554cbe5ff82a221dcd12ed4bd7
|
|
7
|
+
data.tar.gz: 95d48f2e6aedb4160634db949e32be25823f8b808e71dc113f3ce9e8490b507e136a96372c329f8ff03f2476bc52f4dd31fc89f67a9ba2b7e11e5e0bcebf241f
|
|
@@ -55,9 +55,16 @@ module GmailSearchSyntax
|
|
|
55
55
|
advance
|
|
56
56
|
when "-"
|
|
57
57
|
next_char = peek_char
|
|
58
|
-
|
|
58
|
+
prev_char = (@position > 0) ? @input[@position - 1] : nil
|
|
59
|
+
# Negation requires: non-whitespace follows AND (start of input OR whitespace precedes)
|
|
60
|
+
# Gmail behavior: "Coxlee-Gammage" → Coxlee AND Gammage (hyphen is word separator)
|
|
61
|
+
# "Coxlee -Gammage" → Coxlee AND NOT Gammage (space+hyphen = negation)
|
|
62
|
+
if next_char && next_char !~ /\s/ && (prev_char.nil? || prev_char =~ /\s/)
|
|
59
63
|
add_token(:minus, char)
|
|
60
64
|
advance
|
|
65
|
+
elsif prev_char && prev_char !~ /\s/
|
|
66
|
+
# Embedded hyphen (preceded by non-whitespace) - skip it as word separator
|
|
67
|
+
advance
|
|
61
68
|
else
|
|
62
69
|
read_word
|
|
63
70
|
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Embedded Hyphens in Gmail Search
|
|
2
|
+
|
|
3
|
+
## Gmail's Actual Behavior
|
|
4
|
+
|
|
5
|
+
Gmail treats hyphens differently depending on whether they are preceded by whitespace:
|
|
6
|
+
|
|
7
|
+
### Embedded Hyphen (No Preceding Whitespace)
|
|
8
|
+
|
|
9
|
+
When a hyphen appears immediately after a word character (no space before it), Gmail treats it as a **word separator**, not a negation operator. Both parts become separate search tokens that are implicitly ANDed together.
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Coxlee-Gammage
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Gmail behavior: Search for messages containing both "Coxlee" AND "Gammage". Both tokens get highlighted in search results.
|
|
16
|
+
|
|
17
|
+
Parsed as:
|
|
18
|
+
```ruby
|
|
19
|
+
GmailSearchSyntax.parse!("Coxlee-Gammage")
|
|
20
|
+
# => #<And [#<StringToken "Coxlee">, #<StringToken "Gammage">]>
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Space + Hyphen (Negation)
|
|
24
|
+
|
|
25
|
+
When a hyphen is preceded by whitespace (or is at the start of input) and is immediately followed by a non-whitespace character, it functions as the **negation operator**.
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
Coxlee -Gammage
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Gmail behavior: Search for messages containing "Coxlee" but NOT "Gammage".
|
|
32
|
+
|
|
33
|
+
Parsed as:
|
|
34
|
+
```ruby
|
|
35
|
+
GmailSearchSyntax.parse!("Coxlee -Gammage")
|
|
36
|
+
# => #<And [#<StringToken "Coxlee">, #<Not #<StringToken "Gammage">>]>
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Examples
|
|
40
|
+
|
|
41
|
+
| Query | Parsed As | Meaning |
|
|
42
|
+
|-------|-----------|---------|
|
|
43
|
+
| `some-outfit` | `some AND outfit` | Contains both "some" and "outfit" |
|
|
44
|
+
| `some -outfit` | `some AND NOT outfit` | Contains "some" but not "outfit" |
|
|
45
|
+
| `a-b-c` | `a AND b AND c` | Contains all three tokens |
|
|
46
|
+
| `-spam` | `NOT spam` | Does not contain "spam" |
|
|
47
|
+
| `cats-dogs -birds` | `cats AND dogs AND NOT birds` | Contains "cats" and "dogs", not "birds" |
|
|
48
|
+
|
|
49
|
+
## Real-World Use Cases
|
|
50
|
+
|
|
51
|
+
### Hyphenated Names
|
|
52
|
+
```
|
|
53
|
+
from:Mary-Jane
|
|
54
|
+
```
|
|
55
|
+
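Parsed as the operator `from:Mary` AND the separate bare word "Jane": the hyphen ends the operator value, so "Jane" is searched as an ordinary term rather than as part of the sender.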
|
|
56
|
+
|
|
57
|
+
### Hyphenated Terms
|
|
58
|
+
```
|
|
59
|
+
self-service
|
|
60
|
+
```
|
|
61
|
+
Finds messages containing both "self" and "service".
|
|
62
|
+
|
|
63
|
+
### Compound Words
|
|
64
|
+
```
|
|
65
|
+
e-commerce
|
|
66
|
+
```
|
|
67
|
+
Finds messages containing both "e" and "commerce".
|
|
68
|
+
|
|
69
|
+
## Implementation Details
|
|
70
|
+
|
|
71
|
+
The fix is in the tokenizer (`lib/gmail_search_syntax/tokenizer.rb`). When encountering a `-` character:
|
|
72
|
+
|
|
73
|
+
1. Check if there's a non-whitespace character following (potential negation or word separator)
|
|
74
|
+
2. Check if there's whitespace (or nothing) preceding the hyphen
|
|
75
|
+
3. If a non-whitespace character follows and the hyphen is preceded by whitespace or is at the start of input: treat as negation operator (`:minus` token)
|
|
76
|
+
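4. If preceded by non-whitespace: skip the hyphen (acts as word separator, no token emitted). This check matches any non-whitespace character, not only word characters, so a hyphen directly after `(` or `{` is also skipped rather than treated as negation.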
|
|
77
|
+
|
|
78
|
+
```ruby
|
|
79
|
+
when "-"
|
|
80
|
+
next_char = peek_char
|
|
81
|
+
prev_char = @position > 0 ? @input[@position - 1] : nil
|
|
82
|
+
|
|
83
|
+
if next_char && next_char !~ /\s/ && (prev_char.nil? || prev_char =~ /\s/)
|
|
84
|
+
# Negation: preceded by whitespace or at start, followed by non-whitespace
|
|
85
|
+
add_token(:minus, char)
|
|
86
|
+
advance
|
|
87
|
+
elsif prev_char && prev_char !~ /\s/
|
|
88
|
+
# Embedded hyphen: preceded by non-whitespace - skip as word separator
|
|
89
|
+
advance
|
|
90
|
+
else
|
|
91
|
+
read_word
|
|
92
|
+
end
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Bug That Was Fixed
|
|
96
|
+
|
|
97
|
+
Previously, the gem incorrectly treated all hyphens followed by non-whitespace as negation:
|
|
98
|
+
|
|
99
|
+
- **Old (incorrect):** `some-outfit` was parsed as `some AND NOT outfit`
|
|
100
|
+
- **New (correct):** `some-outfit` is parsed as `some AND outfit`
|
|
101
|
+
|
|
102
|
+
This matches Gmail's actual search behavior where hyphenated terms find messages containing both parts of the hyphenated word.
|
|
@@ -142,6 +142,91 @@ class GmailSearchSyntaxTest < Minitest::Test
|
|
|
142
142
|
assert_equal "movie", ast.operands[1].child.value
|
|
143
143
|
end
|
|
144
144
|
|
|
145
|
+
# Gmail behavior: embedded hyphens (no preceding whitespace) are word separators, not negation
|
|
146
|
+
# "Coxlee-Gammage" → Coxlee AND Gammage (both tokens highlighted)
|
|
147
|
+
# "Coxlee -Gammage" → Coxlee AND NOT Gammage (space+hyphen = negation)
|
|
148
|
+
|
|
149
|
+
def test_embedded_hyphen_is_word_separator
|
|
150
|
+
# Gmail behavior: hyphen without preceding whitespace separates words, not negation
|
|
151
|
+
ast = GmailSearchSyntax.parse!("some-outfit")
|
|
152
|
+
assert_instance_of And, ast
|
|
153
|
+
|
|
154
|
+
assert_equal 2, ast.operands.length
|
|
155
|
+
assert_instance_of StringToken, ast.operands[0]
|
|
156
|
+
assert_equal "some", ast.operands[0].value
|
|
157
|
+
|
|
158
|
+
assert_instance_of StringToken, ast.operands[1]
|
|
159
|
+
assert_equal "outfit", ast.operands[1].value
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
def test_embedded_hyphen_multiple
|
|
163
|
+
# Multiple hyphens: a-b-c → a AND b AND c
|
|
164
|
+
ast = GmailSearchSyntax.parse!("a-b-c")
|
|
165
|
+
assert_instance_of And, ast
|
|
166
|
+
|
|
167
|
+
assert_equal 3, ast.operands.length
|
|
168
|
+
assert_equal "a", ast.operands[0].value
|
|
169
|
+
assert_equal "b", ast.operands[1].value
|
|
170
|
+
assert_equal "c", ast.operands[2].value
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def test_embedded_hyphen_real_name
|
|
174
|
+
# Real-world case: hyphenated names
|
|
175
|
+
ast = GmailSearchSyntax.parse!("Coxlee-Gammage")
|
|
176
|
+
assert_instance_of And, ast
|
|
177
|
+
|
|
178
|
+
assert_equal 2, ast.operands.length
|
|
179
|
+
assert_equal "Coxlee", ast.operands[0].value
|
|
180
|
+
assert_equal "Gammage", ast.operands[1].value
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
def test_space_hyphen_is_negation
|
|
184
|
+
# Space + hyphen = negation (unchanged behavior)
|
|
185
|
+
ast = GmailSearchSyntax.parse!("cats -dogs")
|
|
186
|
+
assert_instance_of And, ast
|
|
187
|
+
|
|
188
|
+
assert_equal 2, ast.operands.length
|
|
189
|
+
assert_instance_of StringToken, ast.operands[0]
|
|
190
|
+
assert_equal "cats", ast.operands[0].value
|
|
191
|
+
|
|
192
|
+
assert_instance_of Not, ast.operands[1]
|
|
193
|
+
assert_equal "dogs", ast.operands[1].child.value
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
def test_embedded_hyphen_combined_with_negation
|
|
197
|
+
# Mixed: embedded hyphen + space-preceded negation
|
|
198
|
+
ast = GmailSearchSyntax.parse!("some-outfit -dogs")
|
|
199
|
+
assert_instance_of And, ast
|
|
200
|
+
|
|
201
|
+
assert_equal 3, ast.operands.length
|
|
202
|
+
assert_equal "some", ast.operands[0].value
|
|
203
|
+
assert_equal "outfit", ast.operands[1].value
|
|
204
|
+
assert_instance_of Not, ast.operands[2]
|
|
205
|
+
assert_equal "dogs", ast.operands[2].child.value
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
def test_negation_at_start_of_input
|
|
209
|
+
# Negation at start of input still works
|
|
210
|
+
ast = GmailSearchSyntax.parse!("-spam")
|
|
211
|
+
assert_instance_of Not, ast
|
|
212
|
+
assert_equal "spam", ast.child.value
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
def test_embedded_hyphen_with_operator
|
|
216
|
+
# Embedded hyphen in operator context
|
|
217
|
+
ast = GmailSearchSyntax.parse!("from:mary-jane")
|
|
218
|
+
assert_instance_of And, ast
|
|
219
|
+
|
|
220
|
+
# "from:mary" becomes operator, "-jane" is embedded hyphen → "jane" is separate word
|
|
221
|
+
assert_equal 2, ast.operands.length
|
|
222
|
+
assert_instance_of Operator, ast.operands[0]
|
|
223
|
+
assert_equal "from", ast.operands[0].name
|
|
224
|
+
assert_equal "mary", ast.operands[0].value
|
|
225
|
+
|
|
226
|
+
assert_instance_of StringToken, ast.operands[1]
|
|
227
|
+
assert_equal "jane", ast.operands[1].value
|
|
228
|
+
end
|
|
229
|
+
|
|
145
230
|
def test_around_operator
|
|
146
231
|
ast = GmailSearchSyntax.parse!("holiday AROUND 10 vacation")
|
|
147
232
|
assert_instance_of Around, ast
|
data/test/tokenizer_test.rb
CHANGED
|
@@ -94,6 +94,64 @@ class TokenizerTest < Minitest::Test
|
|
|
94
94
|
assert_token_stream(expected, tokens)
|
|
95
95
|
end
|
|
96
96
|
|
|
97
|
+
def test_tokenize_embedded_hyphen
|
|
98
|
+
# Gmail behavior: embedded hyphen (no preceding whitespace) is a word separator, not negation
|
|
99
|
+
tokens = tokenize("some-outfit")
|
|
100
|
+
expected = [
|
|
101
|
+
{type: :word, value: "some"},
|
|
102
|
+
{type: :word, value: "outfit"},
|
|
103
|
+
{type: :eof}
|
|
104
|
+
]
|
|
105
|
+
assert_token_stream(expected, tokens)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def test_tokenize_multiple_embedded_hyphens
|
|
109
|
+
# Multiple hyphens: a-b-c → three separate words
|
|
110
|
+
tokens = tokenize("a-b-c")
|
|
111
|
+
expected = [
|
|
112
|
+
{type: :word, value: "a"},
|
|
113
|
+
{type: :word, value: "b"},
|
|
114
|
+
{type: :word, value: "c"},
|
|
115
|
+
{type: :eof}
|
|
116
|
+
]
|
|
117
|
+
assert_token_stream(expected, tokens)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def test_tokenize_hyphenated_name
|
|
121
|
+
# Real-world case: hyphenated names like "Coxlee-Gammage"
|
|
122
|
+
tokens = tokenize("Coxlee-Gammage")
|
|
123
|
+
expected = [
|
|
124
|
+
{type: :word, value: "Coxlee"},
|
|
125
|
+
{type: :word, value: "Gammage"},
|
|
126
|
+
{type: :eof}
|
|
127
|
+
]
|
|
128
|
+
assert_token_stream(expected, tokens)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def test_tokenize_negation_at_start
|
|
132
|
+
# Negation at start of input
|
|
133
|
+
tokens = tokenize("-spam")
|
|
134
|
+
expected = [
|
|
135
|
+
{type: :minus},
|
|
136
|
+
{type: :word, value: "spam"},
|
|
137
|
+
{type: :eof}
|
|
138
|
+
]
|
|
139
|
+
assert_token_stream(expected, tokens)
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
def test_tokenize_embedded_hyphen_vs_negation
|
|
143
|
+
# Mixed: embedded hyphen + space-preceded negation
|
|
144
|
+
tokens = tokenize("some-outfit -dogs")
|
|
145
|
+
expected = [
|
|
146
|
+
{type: :word, value: "some"},
|
|
147
|
+
{type: :word, value: "outfit"},
|
|
148
|
+
{type: :minus},
|
|
149
|
+
{type: :word, value: "dogs"},
|
|
150
|
+
{type: :eof}
|
|
151
|
+
]
|
|
152
|
+
assert_token_stream(expected, tokens)
|
|
153
|
+
end
|
|
154
|
+
|
|
97
155
|
def test_tokenize_around
|
|
98
156
|
tokens = tokenize("holiday AROUND 10 vacation")
|
|
99
157
|
expected = [
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gmail_search_syntax
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.3
|
|
4
|
+
version: 0.1.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- me@julik.nl
|
|
@@ -92,6 +92,7 @@ files:
|
|
|
92
92
|
- lib/gmail_search_syntax/tokenizer.rb
|
|
93
93
|
- lib/gmail_search_syntax/version.rb
|
|
94
94
|
- slop/ARCHITECTURE.md
|
|
95
|
+
- slop/EMBEDDED_HYPHENS.md
|
|
95
96
|
- slop/GMAIL_BEHAVIOR_COMPARISON.md
|
|
96
97
|
- slop/GMAIL_COMPATIBILITY_COMPLETE.md
|
|
97
98
|
- slop/GREEDY_VS_NON_GREEDY_TOKENIZATION.md
|