price_scanner 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/price_scanner/detector.rb +13 -4
- data/lib/price_scanner/parser.rb +3 -2
- data/lib/price_scanner/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fe537217d5cc6562f1f6198123b1ea1a9de43c61452abc9764f0bb695df3873b
|
|
4
|
+
data.tar.gz: 0cfeeb21a40fdfeeac475dffdf281d2740f5339465f1ddf45f9649632ffe5585
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5d6f7530f8339d8cdc023fd8dabb282ce64004d26d5698b8cce943e58ab50017ead499fae745e18189ed4f9604f84ad395a8fd1aa61d66cc588eba40b3844d29
|
|
7
|
+
data.tar.gz: 420cb69e1d0c16fa5a83cadb0897f7d96dc45836646207e0d97c706c503b1e56e02cf23b0f67507b6ec6f792c9a61c2c3dc8959a6e8c9a734220dc923cb20347
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.4
|
|
4
|
+
|
|
5
|
+
- Fix false promotions from duplicated price ranges in accessibility markup (e.g. WooCommerce `screen-reader-text`)
|
|
6
|
+
- Range filter now removes all prices whose values match detected range values, not just the directly connected pair
|
|
7
|
+
- Prevents "2,90€ – 16,90€" + "Plage de prix : 2,90€ à 16,90€" from producing a false 83% discount
|
|
8
|
+
|
|
3
9
|
## 0.3.0
|
|
4
10
|
|
|
5
11
|
- Add `include_per_unit:` option to `extract_prices_from_text` — allows including per-unit prices (`£46.00/M`, `29,99 zł/kg`) that are filtered by default
|
|
@@ -3,10 +3,13 @@
|
|
|
3
3
|
module PriceScanner
|
|
4
4
|
# Extracts prices from text using regex patterns with smart filtering.
|
|
5
5
|
module Detector
|
|
6
|
+
# Whitespace accepted as a thousands separator and between amount and currency symbol: any ASCII whitespace (\s — space, tab, newline, ...), NBSP (\u00a0), narrow NBSP (\u202f)
|
|
7
|
+
SP = "[\\s\\u00a0\\u202f]"
|
|
8
|
+
|
|
6
9
|
PRICE_PATTERN = /
|
|
7
|
-
(?:zł|pln|€|\$|£)
|
|
8
|
-
(?<![a-zA-Z\d])(?:\d{1,3}(?:[
|
|
9
|
-
(?<![a-zA-Z\d])(?:\d{1,3}(?:[
|
|
10
|
+
(?:zł|pln|€|\$|£)#{SP}*(?:\d{1,3}(?:[.,#{SP}]\d{3})+|\d{1,4})(?:[.,]\d{1,2})? |
|
|
11
|
+
(?<![a-zA-Z\d])(?:\d{1,3}(?:[.,#{SP}]\d{3})+|\d{1,4})[.,]\d{2}#{SP}*(?:zł|pln|€|\$|£|eur|usd|gbp)(?!\d) |
|
|
12
|
+
(?<![a-zA-Z\d])(?:\d{1,3}(?:[.,#{SP}]\d{3})+|\d{1,4})#{SP}*(?:zł|pln|€|\$|£)(?!\d)
|
|
10
13
|
/ix
|
|
11
14
|
|
|
12
15
|
PER_UNIT_PATTERN = %r{(?:/\s*|za\s+)(?:kg|g|mg|l|ml|szt|m[²³23]?|cm|mm|op|opak|pcs|pc|unit|each|ea|kaps|tabl|tab)\b}i
|
|
@@ -113,7 +116,13 @@ module PriceScanner
|
|
|
113
116
|
return prices if prices.size < MIN_PRICES_FOR_RANGE
|
|
114
117
|
|
|
115
118
|
range_indices = find_range_indices(prices, text)
|
|
116
|
-
prices
|
|
119
|
+
return prices if range_indices.empty?
|
|
120
|
+
|
|
121
|
+
# Remove range prices AND any duplicates with same values.
|
|
122
|
+
# Handles accessibility markup (e.g. screen-reader-text) that repeats
|
|
123
|
+
# range prices with non-dash separators like "à" or "to".
|
|
124
|
+
range_values = range_indices.map { |idx| prices[idx][:value] }.to_set
|
|
125
|
+
prices.reject.with_index { |price, idx| range_indices.include?(idx) || range_values.include?(price[:value]) }
|
|
117
126
|
end
|
|
118
127
|
|
|
119
128
|
def find_range_indices(prices, text)
|
data/lib/price_scanner/parser.rb
CHANGED
|
@@ -17,13 +17,14 @@ module PriceScanner
|
|
|
17
17
|
MULTIPLE_SPACES = /\s{2,}/
|
|
18
18
|
COLLAPSE_WHITESPACE = /\s+/
|
|
19
19
|
NBSP = "\u00a0"
|
|
20
|
+
NNBSP = "\u202f"
|
|
20
21
|
DECIMAL_PLACES = 2
|
|
21
22
|
THOUSANDS_GROUP = /.{1,3}/
|
|
22
23
|
|
|
23
24
|
module_function
|
|
24
25
|
|
|
25
26
|
def normalized_price(value)
|
|
26
|
-
text = value.to_s.tr(NBSP, "
|
|
27
|
+
text = value.to_s.tr(NBSP + NNBSP, " ").strip
|
|
27
28
|
return nil if text.empty?
|
|
28
29
|
|
|
29
30
|
clean = clean_price_text(text)
|
|
@@ -83,7 +84,7 @@ module PriceScanner
|
|
|
83
84
|
end
|
|
84
85
|
|
|
85
86
|
def strip_single_price(cleaned, price)
|
|
86
|
-
normalized = price.to_s.tr(NBSP, "
|
|
87
|
+
normalized = price.to_s.tr(NBSP + NNBSP, " ").strip
|
|
87
88
|
return cleaned if normalized.empty?
|
|
88
89
|
|
|
89
90
|
result = cleaned.gsub(normalized, "").gsub(normalized.delete(" "), "")
|