price_scanner 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.reek.yml +6 -0
- data/.rspec +3 -0
- data/.rubocop.yml +53 -0
- data/LICENSE +21 -0
- data/README.md +177 -0
- data/Rakefile +9 -0
- data/lib/price_scanner/consent_detector.rb +65 -0
- data/lib/price_scanner/detector.rb +140 -0
- data/lib/price_scanner/parser.rb +127 -0
- data/lib/price_scanner/version.rb +5 -0
- data/lib/price_scanner.rb +30 -0
- metadata +57 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 9402118514fcd9aff9b7e4ea1b56d53e95f92c8cb93b8d58d8769f8ee9e5608c
|
|
4
|
+
data.tar.gz: 35dd2b7a9d8c96e1d21214813bfe368ded2e1dce42cf0f45d1bdc52c6ce12345
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: d3911479ea0209ef0caf8584cfb3f09d21567be2322ee4950b560b121364e66b7038771da64d329d31bc6f664c0a0149dac662748e362e00159a3350bcd7654d
|
|
7
|
+
data.tar.gz: 48b172e116ce23cfc3607517bf21d179f0758e02ab4c51fb6043ff2fc64d54314548daa2d0030d8015a572979dc31bd0e4bf9749d4548423fa7636ec5742749f
|
data/.reek.yml
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 3.1
|
|
3
|
+
NewCops: enable
|
|
4
|
+
|
|
5
|
+
require:
|
|
6
|
+
- rubocop-rspec
|
|
7
|
+
- rubocop-rake
|
|
8
|
+
|
|
9
|
+
Style/StringLiterals:
|
|
10
|
+
EnforcedStyle: double_quotes
|
|
11
|
+
|
|
12
|
+
Style/StringLiteralsInInterpolation:
|
|
13
|
+
EnforcedStyle: double_quotes
|
|
14
|
+
|
|
15
|
+
Layout/LineLength:
|
|
16
|
+
Max: 140
|
|
17
|
+
|
|
18
|
+
Metrics/BlockLength:
|
|
19
|
+
Exclude:
|
|
20
|
+
- "spec/**/*"
|
|
21
|
+
|
|
22
|
+
Metrics/ClassLength:
|
|
23
|
+
Max: 150
|
|
24
|
+
|
|
25
|
+
Metrics/MethodLength:
|
|
26
|
+
Max: 30
|
|
27
|
+
|
|
28
|
+
Metrics/ModuleLength:
|
|
29
|
+
Max: 150
|
|
30
|
+
|
|
31
|
+
Metrics/AbcSize:
|
|
32
|
+
Max: 35
|
|
33
|
+
|
|
34
|
+
Metrics/CyclomaticComplexity:
|
|
35
|
+
Max: 15
|
|
36
|
+
|
|
37
|
+
Metrics/PerceivedComplexity:
|
|
38
|
+
Max: 15
|
|
39
|
+
|
|
40
|
+
Style/Documentation:
|
|
41
|
+
Enabled: false
|
|
42
|
+
|
|
43
|
+
RSpec/NestedGroups:
|
|
44
|
+
Enabled: false
|
|
45
|
+
|
|
46
|
+
RSpec/MultipleExpectations:
|
|
47
|
+
Enabled: false
|
|
48
|
+
|
|
49
|
+
RSpec/ExampleLength:
|
|
50
|
+
Enabled: false
|
|
51
|
+
|
|
52
|
+
RSpec/ExpectActual:
|
|
53
|
+
Enabled: false
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Justyna Wojtczak
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# PriceScanner
|
|
2
|
+
|
|
3
|
+
Battle-tested multi-currency price extraction from text. Supports PLN, EUR, GBP, USD with Polish and English number formats.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
gem "price_scanner"
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
### Parse a single price
|
|
14
|
+
|
|
15
|
+
```ruby
|
|
16
|
+
PriceScanner.parse("1.299,00 zł")
|
|
17
|
+
# => { amount: 1299.0, currency: "PLN", text: "1.299,00 zł" }
|
|
18
|
+
|
|
19
|
+
PriceScanner.parse("£49.99")
|
|
20
|
+
# => { amount: 49.99, currency: "GBP", text: "£49.99" }
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Extract all prices from text
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
PriceScanner.scan("Was £49.99 Now £29.99")
|
|
27
|
+
# => [{ amount: 49.99, currency: "GBP", text: "£49.99" },
|
|
28
|
+
# { amount: 29.99, currency: "GBP", text: "£29.99" }]
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Check if text contains a price
|
|
32
|
+
|
|
33
|
+
```ruby
|
|
34
|
+
PriceScanner.contains_price?("Only 99,00 zł") # => true
|
|
35
|
+
PriceScanner.contains_price?("No price here") # => false
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### GDPR consent detection (optional, requires nokogiri)
|
|
39
|
+
|
|
40
|
+
```ruby
|
|
41
|
+
require "nokogiri"
|
|
42
|
+
|
|
43
|
+
doc = Nokogiri::HTML(html)
|
|
44
|
+
node = doc.css(".cookie-banner").first
|
|
45
|
+
PriceScanner::ConsentDetector.consent_node?(node) # => true/false
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Advanced API
|
|
49
|
+
|
|
50
|
+
For finer control, use `Detector` and `Parser` modules directly.
|
|
51
|
+
|
|
52
|
+
#### Detect prices in text
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
PriceScanner::Detector.contains_price?("see price: 49,00 zł") # => true
|
|
56
|
+
|
|
57
|
+
PriceScanner::Detector.extract_prices_from_text("Was 49,00 zł, now 29,00 zł")
|
|
58
|
+
# => [{ text: "49,00 zł", value: 49.0, position: 4 },
|
|
59
|
+
# { text: "29,00 zł", value: 29.0, position: 18 }]
|
|
60
|
+
|
|
61
|
+
PriceScanner::Detector::PRICE_PATTERN # => Regexp matching prices
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
#### Parse and normalize prices
|
|
65
|
+
|
|
66
|
+
```ruby
|
|
67
|
+
PriceScanner::Parser.normalized_price("1.299,00 zł") # => 1299.0
|
|
68
|
+
PriceScanner::Parser.normalized_price("$49.99") # => 49.99
|
|
69
|
+
|
|
70
|
+
PriceScanner::Parser.extract_currency("49,00 zł") # => "PLN"
|
|
71
|
+
PriceScanner::Parser.extract_currency("€120") # => "EUR"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
#### Strip price mentions from text
|
|
75
|
+
|
|
76
|
+
```ruby
|
|
77
|
+
PriceScanner::Parser.strip_price_mentions("Buy for 49,00 zł or 59,00 zł", "49,00 zł", "59,00 zł")
|
|
78
|
+
# => "Buy for or"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
#### Build a regex for a specific price value
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
PriceScanner::Parser.price_regex_from_value(1299.0)
|
|
85
|
+
# => Regexp matching variations like "1 299,00 zł", "1299,00zł", etc.
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Supported currencies
|
|
89
|
+
|
|
90
|
+
| Currency | Symbol | Code | Example input | Parsed |
|
|
91
|
+
|----------|--------|------|---------------|--------|
|
|
92
|
+
| PLN | `zł`, `zl` | `PLN` | `1.299,00 zł` | 1299.0 |
|
|
93
|
+
| EUR | `€` | `EUR` | `€99,00` | 99.0 |
|
|
94
|
+
| USD | `$` | `USD` | `$1,019.00` | 1019.0 |
|
|
95
|
+
| GBP | `£` | `GBP` | `£49.99` | 49.99 |
|
|
96
|
+
|
|
97
|
+
Currency symbols and codes are matched case-insensitively (`pln`, `PLN`, `Pln` all work).
|
|
98
|
+
|
|
99
|
+
## Supported number formats
|
|
100
|
+
|
|
101
|
+
| Format | Example | Parsed |
|
|
102
|
+
|--------|---------|--------|
|
|
103
|
+
| Dot thousands, comma decimal (Polish) | `1.299,00 zł` | 1299.0 |
|
|
104
|
+
| Space thousands, comma decimal | `1 299,00 zł` | 1299.0 |
|
|
105
|
+
| NBSP thousands, comma decimal | `1\u00a0299,00 zł` | 1299.0 |
|
|
106
|
+
| Comma thousands, dot decimal (English) | `$1,299.00` | 1299.0 |
|
|
107
|
+
| No thousands separator | `799,00 zł` | 799.0 |
|
|
108
|
+
| Integer (no decimals) | `£150` | 150.0 |
|
|
109
|
+
| Currency before amount | `zł 248,86` | 248.86 |
|
|
110
|
+
|
|
111
|
+
## Smart filtering
|
|
112
|
+
|
|
113
|
+
Prices that match the following patterns are automatically excluded from results:
|
|
114
|
+
|
|
115
|
+
### Negative prices
|
|
116
|
+
|
|
117
|
+
Prices preceded by `-` or `−` (U+2212) are treated as discount badges and filtered out.
|
|
118
|
+
|
|
119
|
+
```ruby
|
|
120
|
+
PriceScanner.scan("449,00 zł -100,00 zł 349,00 zł")
|
|
121
|
+
# => [{ amount: 449.0, ... }, { amount: 349.0, ... }]
|
|
122
|
+
# -100,00 zł is excluded
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Price ranges
|
|
126
|
+
|
|
127
|
+
Two prices connected by an en-dash (`–`), an em-dash (`—`), or a spaced hyphen (` - `) are recognized as a range and both are removed.
|
|
128
|
+
|
|
129
|
+
```ruby
|
|
130
|
+
PriceScanner.scan("Size S–XL, £3.29 – £92.71, buy now for £49.99")
|
|
131
|
+
# => [{ amount: 49.99, currency: "GBP", text: "£49.99" }]
|
|
132
|
+
# range £3.29 – £92.71 is excluded
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Savings amounts
|
|
136
|
+
|
|
137
|
+
When 3+ prices are detected and the smallest one equals the difference between two others (within a tolerance of 2% of that amount, or 1.00 if that is larger), the savings amount is removed.
|
|
138
|
+
|
|
139
|
+
```ruby
|
|
140
|
+
PriceScanner.scan("Was 449,00 zł, now 349,00 zł. You save 100,00 zł!")
|
|
141
|
+
# => [{ amount: 449.0, ... }, { amount: 349.0, ... }]
|
|
142
|
+
# 100,00 zł is excluded (449 - 349 = 100)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Per-unit prices
|
|
146
|
+
|
|
147
|
+
Prices followed by a unit indicator are filtered out.
|
|
148
|
+
|
|
149
|
+
Supported units: `kg`, `g`, `mg`, `l`, `ml`, `szt`, `m`, `m²`, `m³`, `cm`, `mm`, `op`, `opak`, `pcs`, `pc`, `unit`, `each`, `ea`, `kaps`, `tabl`, `tab`
|
|
150
|
+
|
|
151
|
+
Recognized prefixes: `/` (slash) and `za` (Polish "per").
|
|
152
|
+
|
|
153
|
+
```ruby
|
|
154
|
+
PriceScanner.scan("32,74 zł/kg — buy 500g for 16,37 zł")
|
|
155
|
+
# => [{ amount: 16.37, currency: "PLN", text: "16,37 zł" }]
|
|
156
|
+
# 32,74 zł/kg is excluded
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Deduplication
|
|
160
|
+
|
|
161
|
+
If the same price value appears multiple times, only one occurrence is kept.
|
|
162
|
+
|
|
163
|
+
## Features
|
|
164
|
+
|
|
165
|
+
- **Zero dependencies** (nokogiri optional, only for consent detection)
|
|
166
|
+
- Case-insensitive currency matching
|
|
167
|
+
- Handles regular spaces, non-breaking spaces (NBSP), and mixed whitespace
|
|
168
|
+
- Tracks position of each price in the source text
|
|
169
|
+
- Ignores letter-preceded numbers to avoid false positives from product codes (e.g. `DKA2zł`)
|
|
170
|
+
|
|
171
|
+
## Used by
|
|
172
|
+
|
|
173
|
+
- [snipe.sale](https://snipe.sale) — price tracking service processing thousands of product pages daily
|
|
174
|
+
|
|
175
|
+
## License
|
|
176
|
+
|
|
177
|
+
MIT License. See [LICENSE](LICENSE).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PriceScanner
  # Detects GDPR/cookie consent banners in HTML nodes (requires nokogiri).
  #
  # A node counts as consent UI when the node itself, or one of its nearest
  # ANCESTOR_DEPTH ancestors, either
  # * carries a consent-related id/class/role/aria attribute, or
  # * contains consent-related text AND an actionable element (button, link,
  #   submit input) whose label looks like a consent action.
  module ConsentDetector
    # Consent vocabulary looked for in a node's text (English and Polish).
    # The Polish "ciasteczko" family covers the genitive plural "ciasteczek"
    # as well as the "ciasteczk-" inflections.
    CONSENT_TEXT_REGEX = /
      \bcookie\b|\bcookies\b|\bconsent\b|\bgdpr\b|\bprivacy\b|\btracking\b|\bpreferences\b|\bpersonaliz|marketing\s+cookies|
      do\s+not\s+sell|opt\s+out|opt\s+in|cookie\s+policy|privacy\s+policy|
      \bciastecz(?:ek|k(?:a|i|o|ami|ach|om)?)\b|\bprywatn|\bzgod(?:a|y|ę|zie)?\b|\brodo\b
    /ix
    # Labels of consent action controls. The Polish stems accept both the
    # accented and the ASCII-folded spelling ("odrzuć"/"odrzuc", "odmów"/"odmow").
    CONSENT_ACTION_REGEX = /
      \baccept\b|\bagree\b|\ballow\b|\bmanage\b|\bpreferences\b|\bdecline\b|\breject\b|\bok\b|\bokay\b|\bcontinue\b|save\s+preferences|
      accept\s+all|allow\s+all|got\s+it|\brozumiem\b|\bzgadzam\b|\bakceptuj|\bzaakceptuj|\bodrzu[cć]|\bodm[oó]w
    /ix
    # Substrings of attribute values that identify consent tooling or banners.
    CONSENT_ATTR_REGEX = /
      cookie|consent|gdpr|privacy|cmp|onetrust|trustarc|cookielaw|cookiebot|osano|
      quantcast|usercentrics|didomi|cookieyes|termly|iubenda|shopify-pc__banner
    /ix

    # How many ancestors (closest first) are inspected in addition to the node.
    ANCESTOR_DEPTH = 3

    # Attributes whose values are matched against CONSENT_ATTR_REGEX.
    ATTR_KEYS = %w[id class role aria-label aria-modal].freeze
    # CSS selector for elements that can act as a consent button.
    ACTION_SELECTOR = "button, [role='button'], input[type='button'], input[type='submit'], a"

    module_function

    # Tells whether +node+ belongs to a cookie/GDPR consent banner.
    #
    # @param node [#ancestors, #text, #[], #css, nil] an HTML node (the README
    #   uses Nokogiri nodes); +nil+ is accepted and yields +false+
    # @return [Boolean]
    def consent_node?(node)
      return false unless node

      hits = detect_hits([node] + node.ancestors.take(ANCESTOR_DEPTH))
      # Text alone is not enough (product copy may mention "privacy"); it must
      # be paired with an action control. A matching attribute is sufficient.
      (hits[:text] && hits[:action]) || hits[:attr]
    end

    # Accumulates the three signals over +nodes+.
    # `||=` skips re-evaluating a signal once it has been found.
    #
    # @return [Hash{Symbol => Boolean}] keys :text, :attr and :action
    def detect_hits(nodes)
      result = { text: false, attr: false, action: false }
      nodes.each do |item|
        result[:text] ||= item.text.to_s.match?(CONSENT_TEXT_REGEX)
        result[:attr] ||= attribute_text(item).match?(CONSENT_ATTR_REGEX)
        result[:action] ||= action_button?(item)
      end
      result
    end

    # Joins the values of the ATTR_KEYS attributes present on +node+.
    def attribute_text(node)
      ATTR_KEYS.filter_map { |key| node[key] }.join(" ")
    end

    # True when +node+ contains an actionable element labelled like a consent action.
    def action_button?(node)
      node.css(ACTION_SELECTOR).any? do |button|
        collect_text(button).match?(CONSENT_ACTION_REGEX)
      end
    end

    # Visible text plus the aria-label, title and value attributes of +node+.
    def collect_text(node)
      [node.text, node["aria-label"], node["title"], node["value"]].compact.join(" ")
    end

    private_class_method :detect_hits, :attribute_text, :action_button?, :collect_text
  end
end
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set" # Set is only autoloaded from Ruby 3.2 on; the gemspec allows Ruby 3.1.

module PriceScanner
  # Extracts prices from text using regex patterns with smart filtering.
  #
  # Pipeline of {extract_prices_from_text}:
  # 1. scan the text with PRICE_PATTERN, dropping negative and per-unit prices,
  # 2. drop both ends of every price range,
  # 3. de-duplicate by value,
  # 4. drop the smallest price when it is the difference of two other prices.
  module Detector
    # Three alternatives: currency before the amount; amount with two decimals
    # followed by a currency symbol or code; integer amount followed by a
    # currency symbol. Amounts directly preceded by an ASCII letter are skipped
    # by the look-behind in the last two alternatives.
    PRICE_PATTERN = /
      (?:zł|pln|€|\$|£)[\s\u00a0]*(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})(?:[.,]\d{1,2})? |
      (?<![a-zA-Z])(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})[.,]\d{2}[\s\u00a0]*(?:zł|pln|€|\$|£|eur|usd|gbp)(?!\d) |
      (?<![a-zA-Z])(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})[\s\u00a0]*(?:zł|pln|€|\$|£)(?!\d)
    /ix

    # "/kg", "za szt" and similar per-unit suffixes.
    PER_UNIT_PATTERN = %r{(?:/\s*|za\s+)(?:kg|g|mg|l|ml|szt|m[²³23]?|cm|mm|op|opak|pcs|pc|unit|each|ea|kaps|tabl|tab)\b}i
    # Same pattern, anchored to the start of the text that follows a price.
    PER_UNIT_ANCHOR = /\A#{PER_UNIT_PATTERN.source}/i

    # Characters that mark a price as negative when they sit right before it
    # (ASCII hyphen-minus and U+2212 MINUS SIGN).
    NEGATIVE_PREFIXES = ["-", "\u2212"].freeze

    # En/em dash with optional spaces, or a hyphen surrounded by whitespace.
    RANGE_SEPARATOR_PATTERN = /\s*[–—]\s*|\s+-\s+/

    # Number of characters after a match that are inspected for a per-unit suffix.
    TEXT_AFTER_LOOKAHEAD = 200
    MIN_PRICES_FOR_RANGE = 2
    MIN_PRICES_FOR_SAVINGS = 3
    # A difference only counts as "savings" when it is at least this share of
    # the candidate amount (and at least SAVINGS_MIN_DIFF).
    SAVINGS_MIN_RATIO = 0.1
    SAVINGS_MIN_DIFF = 0.01
    # Allowed gap between the candidate amount and the difference: this share
    # of the candidate, but never less than SAVINGS_TOLERANCE_MIN.
    SAVINGS_TOLERANCE_RATIO = 0.02
    SAVINGS_TOLERANCE_MIN = 1.0

    module_function

    # Finds all prices in +text+ and applies the filters described on the module.
    #
    # @param text [#to_s]
    # @return [Array<Hash>] hashes with :text (whitespace-collapsed match),
    #   :value (Float) and :position (character offset of the match in +text+)
    def extract_prices_from_text(text)
      text_str = text.to_s
      raw_prices = scan_raw_prices(text_str)
      filtered = filter_range_prices(raw_prices, text_str)
      unique = filtered.uniq { |price| price[:value] }
      filter_savings_by_difference(unique)
    end

    # @return [Boolean] whether +text+ contains anything matching PRICE_PATTERN
    def contains_price?(text)
      text.to_s.match?(PRICE_PATTERN)
    end

    # Collects a result for every regex match that survives the per-match filters.
    #
    # The offset is taken from the MatchData of the scan itself. Searching the
    # text again for the matched string could hit an earlier, identical
    # substring that the pattern had rejected (e.g. the "2zł" inside "DKA2zł"),
    # which would misplace the price and the negative/per-unit checks.
    def scan_raw_prices(text_str)
      results = []

      text_str.scan(PRICE_PATTERN) do
        match = Regexp.last_match
        price = build_price_result(text_str, match[0], match.begin(0))
        results << price if price
      end

      results
    end

    # Builds the result hash for one match, or returns nil when the match
    # cannot be parsed, is negative, or is a per-unit price.
    def build_price_result(text_str, match_str, match_index)
      value = Parser.normalized_price(match_str)
      return unless value

      return if negative_price?(text_str, match_index)
      return if per_unit_price?(text_str, match_index + match_str.length)

      clean_text = match_str.gsub(Parser::COLLAPSE_WHITESPACE, " ").strip
      { text: clean_text, value: value, position: match_index }
    end

    # True when the character right before the match is a minus sign.
    def negative_price?(text_str, match_index)
      match_index.positive? && NEGATIVE_PREFIXES.include?(text_str[match_index - 1])
    end

    # True when the text following the match starts with a per-unit suffix.
    def per_unit_price?(text_str, match_end)
      text_after = text_str[match_end, TEXT_AFTER_LOOKAHEAD].to_s.gsub(Parser::COLLAPSE_WHITESPACE, " ").lstrip
      text_after.match?(PER_UNIT_ANCHOR)
    end

    # Removes both prices of every detected range.
    def filter_range_prices(prices, text)
      return prices if prices.size < MIN_PRICES_FOR_RANGE

      range_indices = find_range_indices(prices, text)
      prices.reject.with_index { |_, idx| range_indices.include?(idx) }
    end

    # Indices of prices that are either end of a range.
    #
    # @return [Set<Integer>]
    def find_range_indices(prices, text)
      indices = Set.new
      prices.each_cons(2).with_index do |(current, next_price), idx|
        if range_between?(current, next_price, text)
          indices << idx
          indices << (idx + 1)
        end
      end
      indices
    end

    # True when the text between two consecutive prices contains a range separator.
    def range_between?(current, next_price, text)
      start_pos = current[:position] + current[:text].length
      end_pos = next_price[:position]
      return false if end_pos <= start_pos

      text[start_pos...end_pos].match?(RANGE_SEPARATOR_PATTERN)
    end

    # Drops the smallest price when it looks like "price A minus price B".
    def filter_savings_by_difference(prices)
      return prices if prices.size < MIN_PRICES_FOR_SAVINGS

      values = prices.map { |entry| entry[:value] }
      min_value = values.min

      return prices unless savings_amount?(values, min_value)

      prices.zip(values).filter_map { |price, val| price unless val == min_value }
    end

    # True when some pair of values (neither being +min_value+) differs by
    # roughly +min_value+.
    def savings_amount?(values, min_value)
      values.combination(2).any? do |first, second|
        next false if first == min_value || second == min_value

        matches_savings_pattern?((first - second).abs, min_value)
      end
    end

    # Compares a difference with the candidate savings amount using the
    # SAVINGS_* thresholds.
    def matches_savings_pattern?(diff, min_value)
      return false if diff < [min_value * SAVINGS_MIN_RATIO, SAVINGS_MIN_DIFF].max

      tolerance = [min_value * SAVINGS_TOLERANCE_RATIO, SAVINGS_TOLERANCE_MIN].max
      (min_value - diff).abs <= tolerance
    end

    private_class_method :scan_raw_prices, :build_price_result,
                         :negative_price?, :per_unit_price?,
                         :filter_range_prices, :find_range_indices, :range_between?,
                         :filter_savings_by_difference, :savings_amount?, :matches_savings_pattern?
  end
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PriceScanner
  # Normalizes price strings into floats and extracts currency codes.
  module Parser
    # Lower-case currency symbol or code => ISO 4217 code.
    CURRENCY_MAP = {
      "zł" => "PLN", "pln" => "PLN", "zl" => "PLN",
      "€" => "EUR", "eur" => "EUR",
      "$" => "USD", "usd" => "USD",
      "£" => "GBP", "gbp" => "GBP"
    }.freeze

    CURRENCY_SYMBOLS = CURRENCY_MAP.keys.map { |key| Regexp.escape(key) }.freeze
    CURRENCY_REGEX = /(#{CURRENCY_SYMBOLS.join("|")})/i
    CURRENCY_SUFFIX = /(?:#{CURRENCY_SYMBOLS.join("|")})/i

    MULTIPLE_SPACES = /\s{2,}/
    COLLAPSE_WHITESPACE = /\s+/
    NBSP = "\u00a0"
    DECIMAL_PLACES = 2
    THOUSANDS_GROUP = /.{1,3}/
    # Digits grouped by dots only, e.g. "1.299" or "1.299.000". Prices carry at
    # most two decimals, so three digits after a dot denote a thousands group.
    DOT_THOUSANDS = /\A[1-9]\d{0,2}(?:\.\d{3})+\z/

    module_function

    # Converts a price string to a Float.
    #
    # Handles dot, space or NBSP as thousands separator with a decimal comma
    # ("1.299,00 zł"), comma thousands with a decimal dot ("$1,299.00") and
    # dot-grouped integers ("1.299 zł" => 1299.0).
    #
    # @param value [#to_s]
    # @return [Float, nil] nil when no number can be read from +value+
    def normalized_price(value)
      text = value.to_s.tr(NBSP, " ").strip
      return nil if text.empty?

      clean = clean_price_text(text)
      return nil unless clean

      Float(clean, exception: false)
    end

    # Returns the ISO code of the first currency symbol/code found in +value+.
    #
    # @param value [#to_s]
    # @return [String, nil]
    def extract_currency(value)
      text = value.to_s
      return nil if text.empty?

      match = text.match(CURRENCY_REGEX)
      resolve_currency(match)
    end

    # Removes every mention of the given prices from +text+ and collapses the
    # whitespace left behind.
    #
    # @param text [#to_s]
    # @param prices [Array<#to_s, nil>] price strings; nil entries are ignored
    # @return [String]
    def strip_price_mentions(text, *prices)
      cleaned = text.to_s.tr(NBSP, " ")
      prices.compact.each do |price|
        cleaned = strip_single_price(cleaned, price)
      end
      cleaned.gsub(MULTIPLE_SPACES, " ").strip
    end

    # Builds a regex matching spelling variants of a price: optional space or
    # NBSP between thousands groups, dot or comma before the decimals, and an
    # optional currency suffix.
    #
    # @param value [Numeric, String] an amount, or a price string such as
    #   "1.299,00 zł" which is normalized first
    # @return [Regexp]
    # @raise [ArgumentError] when a string cannot be parsed as a price
    def price_regex_from_value(value)
      integer, decimals = split_price_parts(value)
      int_pattern = thousands_pattern(integer)
      /#{int_pattern}[.,]#{decimals}\s?#{CURRENCY_SUFFIX.source}?/i
    end

    # Splits an amount into its integer digits and DECIMAL_PLACES decimals.
    #
    # @return [Array(String, String)]
    def split_price_parts(value)
      amount = value
      if value.is_a?(String)
        # Price strings ("1.299,00 zł") cannot be formatted directly; parse them first.
        amount = normalized_price(value)
        raise ArgumentError, "cannot parse a price from #{value.inspect}" unless amount
      end

      format("%.#{DECIMAL_PLACES}f", amount).split(".")
    end

    # Splits a digit string into groups of three, counted from the right.
    def thousands_groups(integer)
      integer.reverse.scan(THOUSANDS_GROUP).map(&:reverse).reverse
    end

    # Regex source for +integer+ with an optional space/NBSP between groups.
    def thousands_pattern(integer)
      thousands_groups(integer).join("[\\s\\u00a0]?")
    end

    # Keeps only digits and separators, then rewrites them into Float syntax.
    # Returns nil when nothing numeric is left.
    def clean_price_text(text)
      digits = text.gsub(/[^\d.,]/, "")
      return nil if digits.empty?

      normalize_separators(digits)
    end

    # Maps a CURRENCY_REGEX match to its ISO code.
    def resolve_currency(match)
      return nil unless match

      symbol = match[1]
      CURRENCY_MAP.fetch(symbol.downcase, symbol.upcase)
    end

    # Removes one price from +cleaned+: first the literal string (with and
    # without spaces), then any variant matched by price_regex_from_value.
    def strip_single_price(cleaned, price)
      normalized = price.to_s.tr(NBSP, " ").strip
      return cleaned if normalized.empty?

      result = cleaned.gsub(normalized, "").gsub(normalized.delete(" "), "")
      price_value = normalized_price(price)
      return result unless price_value

      result.gsub(price_regex_from_value(price_value), "")
    end

    # Decides which of "." and "," is the decimal mark and returns a string
    # using "." for decimals and no thousands separators.
    def normalize_separators(clean)
      return resolve_dot_only(clean) unless clean.include?(",")

      if clean.include?(".")
        resolve_mixed_separators(clean)
      else
        resolve_comma_only(clean)
      end
    end

    # Dots only: grouped thousands lose their dots, anything else is left as a decimal.
    def resolve_dot_only(clean)
      clean.match?(DOT_THOUSANDS) ? clean.delete(".") : clean
    end

    # Both separators present: whichever comes last is the decimal mark.
    def resolve_mixed_separators(clean)
      if clean.rindex(",") > clean.rindex(".")
        clean.delete(".").tr(",", ".")
      else
        clean.delete(",")
      end
    end

    # Commas only: a single comma is the decimal mark; with several, only the
    # last one is.
    def resolve_comma_only(clean)
      parts = clean.split(",")
      if parts.size == 2
        clean.tr(",", ".")
      else
        "#{parts[0...-1].join}.#{parts.last}"
      end
    end

    private_class_method :clean_price_text, :resolve_currency, :strip_single_price,
                         :normalize_separators, :resolve_dot_only, :resolve_mixed_separators,
                         :resolve_comma_only
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
Dir.glob(File.join(__dir__, "price_scanner", "*.rb")).each { |f| require_relative f }
|
|
4
|
+
|
|
5
|
+
# Multi-currency price extraction from text.
|
|
6
|
+
module PriceScanner
  module_function

  # Returns the first price found in +text+ as a result hash, or nil when
  # the text contains no price.
  def parse(text)
    first_price = Detector.extract_prices_from_text(text).first
    first_price && build_result(first_price)
  end

  # Returns a result hash for every price the detector keeps, in the order
  # the detector reports them.
  def scan(text)
    detected = Detector.extract_prices_from_text(text)
    detected.map { |entry| build_result(entry) }
  end

  # Delegates to Detector.contains_price?.
  def contains_price?(text)
    Detector.contains_price?(text)
  end

  # Converts a detector entry (:text, :value) into the public result shape
  # (:amount, :currency, :text).
  def build_result(price)
    matched_text = price[:text]
    currency_code = Parser.extract_currency(matched_text)

    { amount: price[:value], currency: currency_code, text: matched_text }
  end

  private_class_method :build_result
end
|
metadata
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: price_scanner
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Justyna
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies: []
|
|
12
|
+
description: Battle-tested price parser supporting PLN, EUR, GBP, USD. Extracts prices
|
|
13
|
+
from text, handles Polish and English number formats, filters savings badges and
|
|
14
|
+
price ranges.
|
|
15
|
+
email:
|
|
16
|
+
- justine84@gmail.com
|
|
17
|
+
executables: []
|
|
18
|
+
extensions: []
|
|
19
|
+
extra_rdoc_files: []
|
|
20
|
+
files:
|
|
21
|
+
- ".reek.yml"
|
|
22
|
+
- ".rspec"
|
|
23
|
+
- ".rubocop.yml"
|
|
24
|
+
- LICENSE
|
|
25
|
+
- README.md
|
|
26
|
+
- Rakefile
|
|
27
|
+
- lib/price_scanner.rb
|
|
28
|
+
- lib/price_scanner/consent_detector.rb
|
|
29
|
+
- lib/price_scanner/detector.rb
|
|
30
|
+
- lib/price_scanner/parser.rb
|
|
31
|
+
- lib/price_scanner/version.rb
|
|
32
|
+
homepage: https://github.com/justi/price_scanner
|
|
33
|
+
licenses:
|
|
34
|
+
- MIT
|
|
35
|
+
metadata:
|
|
36
|
+
homepage_uri: https://github.com/justi/price_scanner
|
|
37
|
+
source_code_uri: https://github.com/justi/price_scanner
|
|
38
|
+
changelog_uri: https://github.com/justi/price_scanner/blob/main/CHANGELOG.md
|
|
39
|
+
rubygems_mfa_required: 'true'
|
|
40
|
+
rdoc_options: []
|
|
41
|
+
require_paths:
|
|
42
|
+
- lib
|
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: 3.1.0
|
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
49
|
+
requirements:
|
|
50
|
+
- - ">="
|
|
51
|
+
- !ruby/object:Gem::Version
|
|
52
|
+
version: '0'
|
|
53
|
+
requirements: []
|
|
54
|
+
rubygems_version: 3.6.7
|
|
55
|
+
specification_version: 4
|
|
56
|
+
summary: Multi-currency price extraction from text
|
|
57
|
+
test_files: []
|