price_scanner 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +8 -1
- data/lib/price_scanner/detector.rb +24 -9
- data/lib/price_scanner/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 933c597623a0441a0554f9cc3b408134180022e69788f15b4e3d785fcf22ab83
|
|
4
|
+
data.tar.gz: 8dc569cea4f591cd1cd11db0eba535a1b166f9913e9e6069155b1cdf08421709
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6bafb309f4ef2d2296cc7981bf44bbf8eb42e185f4995809cc0e8aa4e0aa80ce25299bda7996892d42c03a225f4b961e4c8d7dcdc98f8329145c7e09ce6a6314
|
|
7
|
+
data.tar.gz: 48fce8a5a51cacb32fe062b380da562e72dcab7a44e2c5ea4bc0a3544bb3f854257c4b86491e9d6cccd4dc2ae056329092bad4189c5434ab6e2873718e137aed
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.0
|
|
4
|
+
|
|
5
|
+
- Add `include_per_unit:` option to `extract_prices_from_text` — allows including per-unit prices (`£46.00/M`, `29,99 zł/kg`) that are filtered by default
|
|
6
|
+
|
|
3
7
|
## 0.2.3
|
|
4
8
|
|
|
5
9
|
- Fix comma-as-thousands-separator not recognized in PRICE_PATTERN (`7,999.00 €` → was parsed as `999.00 €`)
|
data/README.md
CHANGED
|
@@ -134,7 +134,7 @@ PriceScanner.scan("Was 449,00 zł, now 349,00 zł. You save 100,00 zł!")
|
|
|
134
134
|
|
|
135
135
|
### Per-unit prices
|
|
136
136
|
|
|
137
|
-
Prices followed by a unit indicator are filtered out.
|
|
137
|
+
Prices followed by a unit indicator are filtered out by default.
|
|
138
138
|
|
|
139
139
|
Supported units: `kg`, `g`, `mg`, `l`, `ml`, `szt`, `m`, `m²`, `m³`, `cm`, `mm`, `op`, `opak`, `pcs`, `pc`, `unit`, `each`, `ea`, `kaps`, `tabl`, `tab`
|
|
140
140
|
|
|
@@ -146,6 +146,13 @@ PriceScanner.scan("32,74 zł/kg — buy 500g for 16,37 zł")
|
|
|
146
146
|
# 32,74 zł/kg is excluded
|
|
147
147
|
```
|
|
148
148
|
|
|
149
|
+
For products priced exclusively per unit (e.g., fabrics sold per meter, bulk goods per kg), pass `include_per_unit: true`:
|
|
150
|
+
|
|
151
|
+
```ruby
|
|
152
|
+
PriceScanner::Detector.extract_prices_from_text("£46.00/M £13.55/M", include_per_unit: true)
|
|
153
|
+
# => [{text: "£46.00/M", value: 46.0}, {text: "£13.55/M", value: 13.55}]
|
|
154
|
+
```
|
|
155
|
+
|
|
149
156
|
### Deduplication
|
|
150
157
|
|
|
151
158
|
If the same price value appears multiple times, only one occurrence is kept.
|
|
data/lib/price_scanner/detector.rb
CHANGED
|
@@ -14,6 +14,10 @@ module PriceScanner
|
|
|
14
14
|
|
|
15
15
|
NEGATIVE_PREFIXES = ["-", "\u2212"].freeze
|
|
16
16
|
|
|
17
|
+
# Prefixes that indicate the following price is a savings amount, not a product price.
|
|
18
|
+
# "Oszczędzasz 6.40 PLN" = "You save 6.40 PLN" — not the product price.
|
|
19
|
+
SAVINGS_PREFIX_PATTERN = /(?:oszcz[eę]dzasz|zaoszcz[eę]d[zź]|savings?|you\s+save|rabat|discount|remise|risparmio|ahorro|sparen|sie\s+sparen)\s*:?\s*\z/i
|
|
20
|
+
|
|
17
21
|
RANGE_SEPARATOR_PATTERN = /\s*[–—]\s*|\s+-\s+/
|
|
18
22
|
|
|
19
23
|
TEXT_AFTER_LOOKAHEAD = 200
|
|
@@ -26,9 +30,9 @@ module PriceScanner
|
|
|
26
30
|
|
|
27
31
|
module_function
|
|
28
32
|
|
|
29
|
-
def extract_prices_from_text(text)
|
|
33
|
+
def extract_prices_from_text(text, include_per_unit: false)
|
|
30
34
|
text_str = text.to_s
|
|
31
|
-
raw_prices = scan_raw_prices(text_str)
|
|
35
|
+
raw_prices = scan_raw_prices(text_str, include_per_unit: include_per_unit)
|
|
32
36
|
filtered = filter_range_prices(raw_prices, text_str)
|
|
33
37
|
unique = filtered.uniq { |price| price[:value] }
|
|
34
38
|
filter_savings_by_difference(unique)
|
|
@@ -38,34 +42,35 @@ module PriceScanner
|
|
|
38
42
|
text.to_s.match?(PRICE_PATTERN)
|
|
39
43
|
end
|
|
40
44
|
|
|
41
|
-
def scan_raw_prices(text_str)
|
|
45
|
+
def scan_raw_prices(text_str, include_per_unit: false)
|
|
42
46
|
results = []
|
|
43
47
|
last_end = 0
|
|
44
48
|
|
|
45
49
|
text_str.scan(PRICE_PATTERN) do |match_str|
|
|
46
|
-
result, last_end = find_price_at(text_str, match_str, last_end)
|
|
50
|
+
result, last_end = find_price_at(text_str, match_str, last_end, include_per_unit: include_per_unit)
|
|
47
51
|
results << result if result
|
|
48
52
|
end
|
|
49
53
|
|
|
50
54
|
results
|
|
51
55
|
end
|
|
52
56
|
|
|
53
|
-
def find_price_at(text_str, match_str, search_from)
|
|
57
|
+
def find_price_at(text_str, match_str, search_from, include_per_unit: false)
|
|
54
58
|
return [nil, search_from] if match_str.empty?
|
|
55
59
|
|
|
56
60
|
match_index = text_str.index(match_str, search_from)
|
|
57
61
|
return [nil, search_from] unless match_index
|
|
58
62
|
|
|
59
63
|
match_end = match_index + match_str.length
|
|
60
|
-
[build_price_result(text_str, match_str, match_index), match_end]
|
|
64
|
+
[build_price_result(text_str, match_str, match_index, include_per_unit: include_per_unit), match_end]
|
|
61
65
|
end
|
|
62
66
|
|
|
63
|
-
def build_price_result(text_str, match_str, match_index)
|
|
67
|
+
def build_price_result(text_str, match_str, match_index, include_per_unit: false)
|
|
64
68
|
value = Parser.normalized_price(match_str)
|
|
65
69
|
return unless value
|
|
66
70
|
|
|
67
71
|
return if negative_price?(text_str, match_index)
|
|
68
|
-
return if per_unit_price?(text_str, match_index + match_str.length)
|
|
72
|
+
return if savings_prefix?(text_str, match_index)
|
|
73
|
+
return if !include_per_unit && per_unit_price?(text_str, match_index + match_str.length)
|
|
69
74
|
|
|
70
75
|
clean_text = match_str.gsub(Parser::COLLAPSE_WHITESPACE, " ").strip
|
|
71
76
|
{ text: clean_text, value: value, position: match_index }
|
|
@@ -89,6 +94,16 @@ module PriceScanner
|
|
|
89
94
|
i >= 0 ? i : nil
|
|
90
95
|
end
|
|
91
96
|
|
|
97
|
+
# Check whether the text immediately before the price ends with a savings prefix such as "Oszczędzasz" or "You save"
|
|
98
|
+
def savings_prefix?(text_str, match_index)
|
|
99
|
+
return false unless match_index > 3
|
|
100
|
+
|
|
101
|
+
# Look at up to 30 chars before the price match
|
|
102
|
+
lookback_start = [match_index - 30, 0].max
|
|
103
|
+
text_before = text_str[lookback_start...match_index]
|
|
104
|
+
text_before.match?(SAVINGS_PREFIX_PATTERN)
|
|
105
|
+
end
|
|
106
|
+
|
|
92
107
|
def per_unit_price?(text_str, match_end)
|
|
93
108
|
text_after = text_str[match_end, TEXT_AFTER_LOOKAHEAD].to_s.gsub(Parser::COLLAPSE_WHITESPACE, " ").lstrip
|
|
94
109
|
text_after.match?(PER_UNIT_ANCHOR)
|
|
@@ -147,7 +162,7 @@ module PriceScanner
|
|
|
147
162
|
end
|
|
148
163
|
|
|
149
164
|
private_class_method :scan_raw_prices, :find_price_at, :build_price_result,
|
|
150
|
-
:negative_price?, :rindex_non_space, :per_unit_price?,
|
|
165
|
+
:negative_price?, :rindex_non_space, :savings_prefix?, :per_unit_price?,
|
|
151
166
|
:filter_range_prices, :find_range_indices, :range_between?,
|
|
152
167
|
:filter_savings_by_difference, :savings_amount?, :matches_savings_pattern?
|
|
153
168
|
end
|