price_scanner 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5866ddf8fd84a9bc437fc7abb5cb6e137ce3d9f9f36d1ff778e1e8db668d0ce1
4
- data.tar.gz: 92ee758d982507e59b5a3da296d5390f6b1af276ebe52359e3718909b5186271
3
+ metadata.gz: 253513985856fa4e2c504df5157eefcf971e1f686fe3869ec5a93b047603294b
4
+ data.tar.gz: b098c087d64fc1ed716575576ce5cfd0b30d59b9d667858ac71a203ea42d4373
5
5
  SHA512:
6
- metadata.gz: 151ba2ea359f8e9bfe5c44b541de9c51e83c8316a8237ca682f00f1e042ac52c2a6328fd778541ab569a7bf577caed233e83d127acf822aabd493c7d519fa6c7
7
- data.tar.gz: 98e055152d925a8ed027f2baab2523d7e77804a6ea139985d296d3bb36273aa44450e80b8093ac18b23f1661b3350dafb62392948a04cdd869d31cf235d7b5b5
6
+ metadata.gz: 2f5cf01095f4ed5beb298262ce385ed43b3cadecbc130590c57a8dfac705ff0dcbe5a74a0534cb4426ef05aba8621d74bae99626709664949a48f31b9a88d432
7
+ data.tar.gz: 27d95b2ce089306059edb5606fc1acd8034dfbf7bb343e253e821edfc259f3389ec1fcc3bde08a1cd3dd25270da206e9bc569d53817a012b3e5e2714b9602f7e
data/CHANGELOG.md ADDED
@@ -0,0 +1,39 @@
1
+ # Changelog
2
+
3
+ ## 0.3.0
4
+
5
+ - Add `include_per_unit:` option to `extract_prices_from_text` — allows including per-unit prices (`£46.00/M`, `29,99 zł/kg`), which are filtered out by default
6
+
7
+ ## 0.2.3
8
+
9
+ - Fix comma thousands separators not being recognized by `PRICE_PATTERN` (`7,999.00 €` was previously parsed as `999.00 €`)
10
+ - Affects prices in English/international format: `$1,299.99`, `8,289.00 €`, etc.
11
+ - Safe change: requires exactly 3 digits after separator, so decimal commas (`19,99 zł`) still work correctly
12
+
13
+ ## 0.2.2
14
+
15
+ - Fix negative price detection with spaced dash ("- 1.040 zł") — savings badges with space between minus and price were not filtered
16
+ - Refactor `negative_price?` with `rindex_non_space` helper (DRY)
17
+ - Distinguish range separators ("Pack of 3 - 29,99 zł") from negative prices
18
+
19
+ ## 0.2.1
20
+
21
+ - Fix false price extraction from model numbers (IP65, HC940, H265, 2K 30MP)
22
+ - Prevent digits before currency symbol from being matched as prices
23
+
24
+ ## 0.2.0
25
+
26
+ - Remove ConsentDetector from gem (moved to smart_offers app)
27
+
28
+ ## 0.1.1
29
+
30
+ - Remove rubycritic dependency
31
+ - Auto-require all price_scanner modules
32
+
33
+ ## 0.1.0
34
+
35
+ - Initial release
36
+ - `PriceScanner::Parser` — normalize prices, extract currency, strip price mentions
37
+ - `PriceScanner::Detector` — extract prices from text, filter negatives/per-unit/ranges/savings
38
+ - Multi-currency support: PLN, EUR, USD, GBP
39
+ - Smart filtering: negative prices, per-unit prices, price ranges, savings amounts
data/README.md CHANGED
@@ -134,7 +134,7 @@ PriceScanner.scan("Was 449,00 zł, now 349,00 zł. You save 100,00 zł!")
134
134
 
135
135
  ### Per-unit prices
136
136
 
137
- Prices followed by a unit indicator are filtered out.
137
+ Prices followed by a unit indicator are filtered out by default.
138
138
 
139
139
  Supported units: `kg`, `g`, `mg`, `l`, `ml`, `szt`, `m`, `m²`, `m³`, `cm`, `mm`, `op`, `opak`, `pcs`, `pc`, `unit`, `each`, `ea`, `kaps`, `tabl`, `tab`
140
140
 
@@ -146,6 +146,13 @@ PriceScanner.scan("32,74 zł/kg — buy 500g for 16,37 zł")
146
146
  # 32,74 zł/kg is excluded
147
147
  ```
148
148
 
149
+ For products priced exclusively per unit (e.g., fabrics sold per meter, bulk goods per kg), pass `include_per_unit: true`:
150
+
151
+ ```ruby
152
+ PriceScanner::Detector.extract_prices_from_text("£46.00/M £13.55/M", include_per_unit: true)
153
+ # => [{text: "£46.00", value: 46.0}, {text: "£13.55", value: 13.55}]
154
+ ```
155
+
149
156
  ### Deduplication
150
157
 
151
158
  If the same price value appears multiple times, only one occurrence is kept.
@@ -4,9 +4,9 @@ module PriceScanner
4
4
  # Extracts prices from text using regex patterns with smart filtering.
5
5
  module Detector
6
6
  PRICE_PATTERN = /
7
- (?:zł|pln|€|\$|£)[\s\u00a0]*(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})(?:[.,]\d{1,2})? |
8
- (?<![a-zA-Z\d])(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})[.,]\d{2}[\s\u00a0]*(?:zł|pln|€|\$|£|eur|usd|gbp)(?!\d) |
9
- (?<![a-zA-Z\d])(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})[\s\u00a0]*(?:zł|pln|€|\$|£)(?!\d)
7
+ (?:zł|pln|€|\$|£)[\s\u00a0]*(?:\d{1,3}(?:[.,\s\u00a0]\d{3})+|\d{1,4})(?:[.,]\d{1,2})? |
8
+ (?<![a-zA-Z\d])(?:\d{1,3}(?:[.,\s\u00a0]\d{3})+|\d{1,4})[.,]\d{2}[\s\u00a0]*(?:zł|pln|€|\$|£|eur|usd|gbp)(?!\d) |
9
+ (?<![a-zA-Z\d])(?:\d{1,3}(?:[.,\s\u00a0]\d{3})+|\d{1,4})[\s\u00a0]*(?:zł|pln|€|\$|£)(?!\d)
10
10
  /ix
11
11
 
12
12
  PER_UNIT_PATTERN = %r{(?:/\s*|za\s+)(?:kg|g|mg|l|ml|szt|m[²³23]?|cm|mm|op|opak|pcs|pc|unit|each|ea|kaps|tabl|tab)\b}i
@@ -26,9 +26,9 @@ module PriceScanner
26
26
 
27
27
  module_function
28
28
 
29
- def extract_prices_from_text(text)
29
+ def extract_prices_from_text(text, include_per_unit: false)
30
30
  text_str = text.to_s
31
- raw_prices = scan_raw_prices(text_str)
31
+ raw_prices = scan_raw_prices(text_str, include_per_unit: include_per_unit)
32
32
  filtered = filter_range_prices(raw_prices, text_str)
33
33
  unique = filtered.uniq { |price| price[:value] }
34
34
  filter_savings_by_difference(unique)
@@ -38,34 +38,34 @@ module PriceScanner
38
38
  text.to_s.match?(PRICE_PATTERN)
39
39
  end
40
40
 
41
- def scan_raw_prices(text_str)
41
+ def scan_raw_prices(text_str, include_per_unit: false)
42
42
  results = []
43
43
  last_end = 0
44
44
 
45
45
  text_str.scan(PRICE_PATTERN) do |match_str|
46
- result, last_end = find_price_at(text_str, match_str, last_end)
46
+ result, last_end = find_price_at(text_str, match_str, last_end, include_per_unit: include_per_unit)
47
47
  results << result if result
48
48
  end
49
49
 
50
50
  results
51
51
  end
52
52
 
53
- def find_price_at(text_str, match_str, search_from)
53
+ def find_price_at(text_str, match_str, search_from, include_per_unit: false)
54
54
  return [nil, search_from] if match_str.empty?
55
55
 
56
56
  match_index = text_str.index(match_str, search_from)
57
57
  return [nil, search_from] unless match_index
58
58
 
59
59
  match_end = match_index + match_str.length
60
- [build_price_result(text_str, match_str, match_index), match_end]
60
+ [build_price_result(text_str, match_str, match_index, include_per_unit: include_per_unit), match_end]
61
61
  end
62
62
 
63
- def build_price_result(text_str, match_str, match_index)
63
+ def build_price_result(text_str, match_str, match_index, include_per_unit: false)
64
64
  value = Parser.normalized_price(match_str)
65
65
  return unless value
66
66
 
67
67
  return if negative_price?(text_str, match_index)
68
- return if per_unit_price?(text_str, match_index + match_str.length)
68
+ return if !include_per_unit && per_unit_price?(text_str, match_index + match_str.length)
69
69
 
70
70
  clean_text = match_str.gsub(Parser::COLLAPSE_WHITESPACE, " ").strip
71
71
  { text: clean_text, value: value, position: match_index }
@@ -74,23 +74,19 @@ module PriceScanner
74
74
  def negative_price?(text_str, match_index)
75
75
  return false unless match_index.positive?
76
76
 
77
- # Direct prefix: "-1.040,00 "
78
- return true if NEGATIVE_PREFIXES.include?(text_str[match_index - 1])
77
+ # Find the non-whitespace char before price: "-1.040" or "- 1.040"
78
+ dash_pos = rindex_non_space(text_str, match_index - 1)
79
+ return false unless dash_pos && NEGATIVE_PREFIXES.include?(text_str[dash_pos])
79
80
 
80
- # Spaced prefix: "- 1.040 zł" only when dash is at start or preceded by non-digit
81
- # "Pack of 3 - 29,99 zł" → dash after digit = range separator, not negative
82
- i = match_index - 1
83
- i -= 1 while i >= 0 && text_str[i] =~ /\s/
84
- return false unless i >= 0 && NEGATIVE_PREFIXES.include?(text_str[i])
85
-
86
- # Dash at start of text = negative
87
- return true if i == 0
81
+ # Dash at start of text = negative; after digit = range separator ("3 - 29,99")
82
+ before_dash = rindex_non_space(text_str, dash_pos - 1)
83
+ before_dash.nil? || text_str[before_dash] !~ /\d/
84
+ end
88
85
 
89
- # Check what's before the dash (skip whitespace)
90
- j = i - 1
91
- j -= 1 while j >= 0 && text_str[j] =~ /\s/
92
- # Dash after digit = range separator ("3 - 29,99"), not negative
93
- j < 0 || text_str[j] !~ /\d/
86
+ def rindex_non_space(text_str, from)
87
+ i = from
88
+ i -= 1 while i >= 0 && text_str[i] =~ /\s/
89
+ i >= 0 ? i : nil
94
90
  end
95
91
 
96
92
  def per_unit_price?(text_str, match_end)
@@ -151,7 +147,7 @@ module PriceScanner
151
147
  end
152
148
 
153
149
  private_class_method :scan_raw_prices, :find_price_at, :build_price_result,
154
- :negative_price?, :per_unit_price?,
150
+ :negative_price?, :rindex_non_space, :per_unit_price?,
155
151
  :filter_range_prices, :find_range_indices, :range_between?,
156
152
  :filter_savings_by_difference, :savings_amount?, :matches_savings_pattern?
157
153
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PriceScanner
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: price_scanner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justyna
@@ -21,6 +21,7 @@ files:
21
21
  - ".reek.yml"
22
22
  - ".rspec"
23
23
  - ".rubocop.yml"
24
+ - CHANGELOG.md
24
25
  - LICENSE
25
26
  - README.md
26
27
  - Rakefile