price_scanner 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -0
- data/README.md +8 -1
- data/lib/price_scanner/detector.rb +23 -27
- data/lib/price_scanner/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 253513985856fa4e2c504df5157eefcf971e1f686fe3869ec5a93b047603294b
|
|
4
|
+
data.tar.gz: b098c087d64fc1ed716575576ce5cfd0b30d59b9d667858ac71a203ea42d4373
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2f5cf01095f4ed5beb298262ce385ed43b3cadecbc130590c57a8dfac705ff0dcbe5a74a0534cb4426ef05aba8621d74bae99626709664949a48f31b9a88d432
|
|
7
|
+
data.tar.gz: 27d95b2ce089306059edb5606fc1acd8034dfbf7bb343e253e821edfc259f3389ec1fcc3bde08a1cd3dd25270da206e9bc569d53817a012b3e5e2714b9602f7e
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.3.0
|
|
4
|
+
|
|
5
|
+
- Add `include_per_unit:` option to `extract_prices_from_text` — allows including per-unit prices (`£46.00/M`, `29,99 zł/kg`) that are filtered by default
|
|
6
|
+
|
|
7
|
+
## 0.2.3
|
|
8
|
+
|
|
9
|
+
- Fix comma-as-thousands-separator not recognized in PRICE_PATTERN (`7,999.00 €` → was parsed as `999.00 €`)
|
|
10
|
+
- Affects prices in English/international format: `$1,299.99`, `8,289.00 €`, etc.
|
|
11
|
+
- Safe change: requires exactly 3 digits after separator, so decimal commas (`19,99 zł`) still work correctly
|
|
12
|
+
|
|
13
|
+
## 0.2.2
|
|
14
|
+
|
|
15
|
+
- Fix negative price detection with spaced dash ("- 1.040 zł") — savings badges with space between minus and price were not filtered
|
|
16
|
+
- Refactor `negative_price?` with `rindex_non_space` helper (DRY)
|
|
17
|
+
- Distinguish range separators ("Pack of 3 - 29,99 zł") from negative prices
|
|
18
|
+
|
|
19
|
+
## 0.2.1
|
|
20
|
+
|
|
21
|
+
- Fix false price extraction from model numbers (IP65, HC940, H265, 2K 30MP)
|
|
22
|
+
- Prevent digits before currency symbol from being matched as prices
|
|
23
|
+
|
|
24
|
+
## 0.2.0
|
|
25
|
+
|
|
26
|
+
- Remove ConsentDetector from gem (moved to smart_offers app)
|
|
27
|
+
|
|
28
|
+
## 0.1.1
|
|
29
|
+
|
|
30
|
+
- Remove rubycritic dependency
|
|
31
|
+
- Auto-require all price_scanner modules
|
|
32
|
+
|
|
33
|
+
## 0.1.0
|
|
34
|
+
|
|
35
|
+
- Initial release
|
|
36
|
+
- `PriceScanner::Parser` — normalize prices, extract currency, strip price mentions
|
|
37
|
+
- `PriceScanner::Detector` — extract prices from text, filter negatives/per-unit/ranges/savings
|
|
38
|
+
- Multi-currency support: PLN, EUR, USD, GBP
|
|
39
|
+
- Smart filtering: negative prices, per-unit prices, price ranges, savings amounts
|
data/README.md
CHANGED
|
@@ -134,7 +134,7 @@ PriceScanner.scan("Was 449,00 zł, now 349,00 zł. You save 100,00 zł!")
|
|
|
134
134
|
|
|
135
135
|
### Per-unit prices
|
|
136
136
|
|
|
137
|
-
Prices followed by a unit indicator are filtered out.
|
|
137
|
+
Prices followed by a unit indicator are filtered out by default.
|
|
138
138
|
|
|
139
139
|
Supported units: `kg`, `g`, `mg`, `l`, `ml`, `szt`, `m`, `m²`, `m³`, `cm`, `mm`, `op`, `opak`, `pcs`, `pc`, `unit`, `each`, `ea`, `kaps`, `tabl`, `tab`
|
|
140
140
|
|
|
@@ -146,6 +146,13 @@ PriceScanner.scan("32,74 zł/kg — buy 500g for 16,37 zł")
|
|
|
146
146
|
# 32,74 zł/kg is excluded
|
|
147
147
|
```
|
|
148
148
|
|
|
149
|
+
For products priced exclusively per unit (e.g., fabrics sold per meter, bulk goods per kg), pass `include_per_unit: true`:
|
|
150
|
+
|
|
151
|
+
```ruby
|
|
152
|
+
PriceScanner::Detector.extract_prices_from_text("£46.00/M £13.55/M", include_per_unit: true)
|
|
153
|
+
# => [{text: "£46.00/M", value: 46.0}, {text: "£13.55/M", value: 13.55}]
|
|
154
|
+
```
|
|
155
|
+
|
|
149
156
|
### Deduplication
|
|
150
157
|
|
|
151
158
|
If the same price value appears multiple times, only one occurrence is kept.
|
data/lib/price_scanner/detector.rb
CHANGED
|
@@ -4,9 +4,9 @@ module PriceScanner
|
|
|
4
4
|
# Extracts prices from text using regex patterns with smart filtering.
|
|
5
5
|
module Detector
|
|
6
6
|
PRICE_PATTERN = /
|
|
7
|
-
(?:zł|pln|€|\$|£)[\s\u00a0]*(?:\d{1,3}(?:[
|
|
8
|
-
(?<![a-zA-Z\d])(?:\d{1,3}(?:[
|
|
9
|
-
(?<![a-zA-Z\d])(?:\d{1,3}(?:[
|
|
7
|
+
(?:zł|pln|€|\$|£)[\s\u00a0]*(?:\d{1,3}(?:[.,\s\u00a0]\d{3})+|\d{1,4})(?:[.,]\d{1,2})? |
|
|
8
|
+
(?<![a-zA-Z\d])(?:\d{1,3}(?:[.,\s\u00a0]\d{3})+|\d{1,4})[.,]\d{2}[\s\u00a0]*(?:zł|pln|€|\$|£|eur|usd|gbp)(?!\d) |
|
|
9
|
+
(?<![a-zA-Z\d])(?:\d{1,3}(?:[.,\s\u00a0]\d{3})+|\d{1,4})[\s\u00a0]*(?:zł|pln|€|\$|£)(?!\d)
|
|
10
10
|
/ix
|
|
11
11
|
|
|
12
12
|
PER_UNIT_PATTERN = %r{(?:/\s*|za\s+)(?:kg|g|mg|l|ml|szt|m[²³23]?|cm|mm|op|opak|pcs|pc|unit|each|ea|kaps|tabl|tab)\b}i
|
|
@@ -26,9 +26,9 @@ module PriceScanner
|
|
|
26
26
|
|
|
27
27
|
module_function
|
|
28
28
|
|
|
29
|
-
def extract_prices_from_text(text)
|
|
29
|
+
def extract_prices_from_text(text, include_per_unit: false)
|
|
30
30
|
text_str = text.to_s
|
|
31
|
-
raw_prices = scan_raw_prices(text_str)
|
|
31
|
+
raw_prices = scan_raw_prices(text_str, include_per_unit: include_per_unit)
|
|
32
32
|
filtered = filter_range_prices(raw_prices, text_str)
|
|
33
33
|
unique = filtered.uniq { |price| price[:value] }
|
|
34
34
|
filter_savings_by_difference(unique)
|
|
@@ -38,34 +38,34 @@ module PriceScanner
|
|
|
38
38
|
text.to_s.match?(PRICE_PATTERN)
|
|
39
39
|
end
|
|
40
40
|
|
|
41
|
-
def scan_raw_prices(text_str)
|
|
41
|
+
def scan_raw_prices(text_str, include_per_unit: false)
|
|
42
42
|
results = []
|
|
43
43
|
last_end = 0
|
|
44
44
|
|
|
45
45
|
text_str.scan(PRICE_PATTERN) do |match_str|
|
|
46
|
-
result, last_end = find_price_at(text_str, match_str, last_end)
|
|
46
|
+
result, last_end = find_price_at(text_str, match_str, last_end, include_per_unit: include_per_unit)
|
|
47
47
|
results << result if result
|
|
48
48
|
end
|
|
49
49
|
|
|
50
50
|
results
|
|
51
51
|
end
|
|
52
52
|
|
|
53
|
-
def find_price_at(text_str, match_str, search_from)
|
|
53
|
+
def find_price_at(text_str, match_str, search_from, include_per_unit: false)
|
|
54
54
|
return [nil, search_from] if match_str.empty?
|
|
55
55
|
|
|
56
56
|
match_index = text_str.index(match_str, search_from)
|
|
57
57
|
return [nil, search_from] unless match_index
|
|
58
58
|
|
|
59
59
|
match_end = match_index + match_str.length
|
|
60
|
-
[build_price_result(text_str, match_str, match_index), match_end]
|
|
60
|
+
[build_price_result(text_str, match_str, match_index, include_per_unit: include_per_unit), match_end]
|
|
61
61
|
end
|
|
62
62
|
|
|
63
|
-
def build_price_result(text_str, match_str, match_index)
|
|
63
|
+
def build_price_result(text_str, match_str, match_index, include_per_unit: false)
|
|
64
64
|
value = Parser.normalized_price(match_str)
|
|
65
65
|
return unless value
|
|
66
66
|
|
|
67
67
|
return if negative_price?(text_str, match_index)
|
|
68
|
-
return if per_unit_price?(text_str, match_index + match_str.length)
|
|
68
|
+
return if !include_per_unit && per_unit_price?(text_str, match_index + match_str.length)
|
|
69
69
|
|
|
70
70
|
clean_text = match_str.gsub(Parser::COLLAPSE_WHITESPACE, " ").strip
|
|
71
71
|
{ text: clean_text, value: value, position: match_index }
|
|
@@ -74,23 +74,19 @@ module PriceScanner
|
|
|
74
74
|
def negative_price?(text_str, match_index)
|
|
75
75
|
return false unless match_index.positive?
|
|
76
76
|
|
|
77
|
-
#
|
|
78
|
-
|
|
77
|
+
# Find the non-whitespace char before price: "-1.040" or "- 1.040"
|
|
78
|
+
dash_pos = rindex_non_space(text_str, match_index - 1)
|
|
79
|
+
return false unless dash_pos && NEGATIVE_PREFIXES.include?(text_str[dash_pos])
|
|
79
80
|
|
|
80
|
-
#
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
return false unless i >= 0 && NEGATIVE_PREFIXES.include?(text_str[i])
|
|
85
|
-
|
|
86
|
-
# Dash at start of text = negative
|
|
87
|
-
return true if i == 0
|
|
81
|
+
# Dash at start of text = negative; after digit = range separator ("3 - 29,99")
|
|
82
|
+
before_dash = rindex_non_space(text_str, dash_pos - 1)
|
|
83
|
+
before_dash.nil? || text_str[before_dash] !~ /\d/
|
|
84
|
+
end
|
|
88
85
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
j < 0 || text_str[j] !~ /\d/
|
|
86
|
+
def rindex_non_space(text_str, from)
|
|
87
|
+
i = from
|
|
88
|
+
i -= 1 while i >= 0 && text_str[i] =~ /\s/
|
|
89
|
+
i >= 0 ? i : nil
|
|
94
90
|
end
|
|
95
91
|
|
|
96
92
|
def per_unit_price?(text_str, match_end)
|
|
@@ -151,7 +147,7 @@ module PriceScanner
|
|
|
151
147
|
end
|
|
152
148
|
|
|
153
149
|
private_class_method :scan_raw_prices, :find_price_at, :build_price_result,
|
|
154
|
-
:negative_price?, :per_unit_price?,
|
|
150
|
+
:negative_price?, :rindex_non_space, :per_unit_price?,
|
|
155
151
|
:filter_range_prices, :find_range_indices, :range_between?,
|
|
156
152
|
:filter_savings_by_difference, :savings_amount?, :matches_savings_pattern?
|
|
157
153
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: price_scanner
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.2
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Justyna
|
|
@@ -21,6 +21,7 @@ files:
|
|
|
21
21
|
- ".reek.yml"
|
|
22
22
|
- ".rspec"
|
|
23
23
|
- ".rubocop.yml"
|
|
24
|
+
- CHANGELOG.md
|
|
24
25
|
- LICENSE
|
|
25
26
|
- README.md
|
|
26
27
|
- Rakefile
|