price_scanner 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2014de33dd81d654b4fd82992474605dd110ff5154299dec1bfca80ec17f91db
4
- data.tar.gz: b341b9072d034078d95c8a335d0d8f9d9b62bd2d00dc4379441a48cebe3091b9
3
+ metadata.gz: 933c597623a0441a0554f9cc3b408134180022e69788f15b4e3d785fcf22ab83
4
+ data.tar.gz: 8dc569cea4f591cd1cd11db0eba535a1b166f9913e9e6069155b1cdf08421709
5
5
  SHA512:
6
- metadata.gz: edffb337ea738fa8541612a49d1a2417f40c4d3463ef4b0509eef224e34172278f1ea585b2c34e470aea0feb457540d7be2218a4fdc97f2fb2e67e8c75244190
7
- data.tar.gz: e0a1d51e6a441400d41c6c77c25ca9a5d083ec8e2d758a76685321281e4e14c7bf0a05d3bbac27cd60ecc04e0babf8f2bcd0aad23e471246fcbf6f957fcc890c
6
+ metadata.gz: 6bafb309f4ef2d2296cc7981bf44bbf8eb42e185f4995809cc0e8aa4e0aa80ce25299bda7996892d42c03a225f4b961e4c8d7dcdc98f8329145c7e09ce6a6314
7
+ data.tar.gz: 48fce8a5a51cacb32fe062b380da562e72dcab7a44e2c5ea4bc0a3544bb3f854257c4b86491e9d6cccd4dc2ae056329092bad4189c5434ab6e2873718e137aed
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.0
4
+
5
+ - Add `include_per_unit:` option to `extract_prices_from_text` — allows including per-unit prices (`£46.00/M`, `29,99 zł/kg`) that are filtered by default
6
+
3
7
  ## 0.2.3
4
8
 
5
9
  - Fix comma-as-thousands-separator not recognized in PRICE_PATTERN (`7,999.00 €` → was parsed as `999.00 €`)
data/README.md CHANGED
@@ -134,7 +134,7 @@ PriceScanner.scan("Was 449,00 zł, now 349,00 zł. You save 100,00 zł!")
134
134
 
135
135
  ### Per-unit prices
136
136
 
137
- Prices followed by a unit indicator are filtered out.
137
+ Prices followed by a unit indicator are filtered out by default.
138
138
 
139
139
  Supported units: `kg`, `g`, `mg`, `l`, `ml`, `szt`, `m`, `m²`, `m³`, `cm`, `mm`, `op`, `opak`, `pcs`, `pc`, `unit`, `each`, `ea`, `kaps`, `tabl`, `tab`
140
140
 
@@ -146,6 +146,13 @@ PriceScanner.scan("32,74 zł/kg — buy 500g for 16,37 zł")
146
146
  # 32,74 zł/kg is excluded
147
147
  ```
148
148
 
149
+ For products priced exclusively per unit (e.g., fabrics sold per meter, bulk goods per kg), pass `include_per_unit: true`:
150
+
151
+ ```ruby
152
+ PriceScanner::Detector.extract_prices_from_text("£46.00/M £13.55/M", include_per_unit: true)
153
+ # => [{text: "£46.00/M", value: 46.0}, {text: "£13.55/M", value: 13.55}]
154
+ ```
155
+
149
156
  ### Deduplication
150
157
 
151
158
  If the same price value appears multiple times, only one occurrence is kept.
data/lib/price_scanner/detector.rb CHANGED
@@ -14,6 +14,10 @@ module PriceScanner
14
14
 
15
15
  NEGATIVE_PREFIXES = ["-", "\u2212"].freeze
16
16
 
17
+ # Prefixes that indicate the following price is a savings amount, not a product price.
18
+ # "Oszczędzasz 6.40 PLN" = "You save 6.40 PLN" — not the product price.
19
+ SAVINGS_PREFIX_PATTERN = /(?:oszcz[eę]dzasz|zaoszcz[eę]d[zź]|savings?|you\s+save|rabat|discount|remise|risparmio|ahorro|sparen|sie\s+sparen)\s*:?\s*\z/i
20
+
17
21
  RANGE_SEPARATOR_PATTERN = /\s*[–—]\s*|\s+-\s+/
18
22
 
19
23
  TEXT_AFTER_LOOKAHEAD = 200
@@ -26,9 +30,9 @@ module PriceScanner
26
30
 
27
31
  module_function
28
32
 
29
- def extract_prices_from_text(text)
33
+ def extract_prices_from_text(text, include_per_unit: false)
30
34
  text_str = text.to_s
31
- raw_prices = scan_raw_prices(text_str)
35
+ raw_prices = scan_raw_prices(text_str, include_per_unit: include_per_unit)
32
36
  filtered = filter_range_prices(raw_prices, text_str)
33
37
  unique = filtered.uniq { |price| price[:value] }
34
38
  filter_savings_by_difference(unique)
@@ -38,34 +42,35 @@ module PriceScanner
38
42
  text.to_s.match?(PRICE_PATTERN)
39
43
  end
40
44
 
41
- def scan_raw_prices(text_str)
45
+ def scan_raw_prices(text_str, include_per_unit: false)
42
46
  results = []
43
47
  last_end = 0
44
48
 
45
49
  text_str.scan(PRICE_PATTERN) do |match_str|
46
- result, last_end = find_price_at(text_str, match_str, last_end)
50
+ result, last_end = find_price_at(text_str, match_str, last_end, include_per_unit: include_per_unit)
47
51
  results << result if result
48
52
  end
49
53
 
50
54
  results
51
55
  end
52
56
 
53
- def find_price_at(text_str, match_str, search_from)
57
+ def find_price_at(text_str, match_str, search_from, include_per_unit: false)
54
58
  return [nil, search_from] if match_str.empty?
55
59
 
56
60
  match_index = text_str.index(match_str, search_from)
57
61
  return [nil, search_from] unless match_index
58
62
 
59
63
  match_end = match_index + match_str.length
60
- [build_price_result(text_str, match_str, match_index), match_end]
64
+ [build_price_result(text_str, match_str, match_index, include_per_unit: include_per_unit), match_end]
61
65
  end
62
66
 
63
- def build_price_result(text_str, match_str, match_index)
67
+ def build_price_result(text_str, match_str, match_index, include_per_unit: false)
64
68
  value = Parser.normalized_price(match_str)
65
69
  return unless value
66
70
 
67
71
  return if negative_price?(text_str, match_index)
68
- return if per_unit_price?(text_str, match_index + match_str.length)
72
+ return if savings_prefix?(text_str, match_index)
73
+ return if !include_per_unit && per_unit_price?(text_str, match_index + match_str.length)
69
74
 
70
75
  clean_text = match_str.gsub(Parser::COLLAPSE_WHITESPACE, " ").strip
71
76
  { text: clean_text, value: value, position: match_index }
@@ -89,6 +94,16 @@ module PriceScanner
89
94
  i >= 0 ? i : nil
90
95
  end
91
96
 
97
+ # Check if text before the price contains a savings prefix like "Oszczędzasz" or "You save"
98
+ def savings_prefix?(text_str, match_index)
99
+ return false unless match_index > 3
100
+
101
+ # Look at up to 30 chars before the price match
102
+ lookback_start = [match_index - 30, 0].max
103
+ text_before = text_str[lookback_start...match_index]
104
+ text_before.match?(SAVINGS_PREFIX_PATTERN)
105
+ end
106
+
92
107
  def per_unit_price?(text_str, match_end)
93
108
  text_after = text_str[match_end, TEXT_AFTER_LOOKAHEAD].to_s.gsub(Parser::COLLAPSE_WHITESPACE, " ").lstrip
94
109
  text_after.match?(PER_UNIT_ANCHOR)
@@ -147,7 +162,7 @@ module PriceScanner
147
162
  end
148
163
 
149
164
  private_class_method :scan_raw_prices, :find_price_at, :build_price_result,
150
- :negative_price?, :rindex_non_space, :per_unit_price?,
165
+ :negative_price?, :rindex_non_space, :savings_prefix?, :per_unit_price?,
151
166
  :filter_range_prices, :find_range_indices, :range_between?,
152
167
  :filter_savings_by_difference, :savings_amount?, :matches_savings_pattern?
153
168
  end
data/lib/price_scanner/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PriceScanner
4
- VERSION = "0.2.3"
4
+ VERSION = "0.3.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: price_scanner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justyna