price_scanner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9402118514fcd9aff9b7e4ea1b56d53e95f92c8cb93b8d58d8769f8ee9e5608c
4
+ data.tar.gz: 35dd2b7a9d8c96e1d21214813bfe368ded2e1dce42cf0f45d1bdc52c6ce12345
5
+ SHA512:
6
+ metadata.gz: d3911479ea0209ef0caf8584cfb3f09d21567be2322ee4950b560b121364e66b7038771da64d329d31bc6f664c0a0149dac662748e362e00159a3350bcd7654d
7
+ data.tar.gz: 48b172e116ce23cfc3607517bf21d179f0758e02ab4c51fb6043ff2fc64d54314548daa2d0030d8015a572979dc31bd0e4bf9749d4548423fa7636ec5742749f
data/.reek.yml ADDED
@@ -0,0 +1,6 @@
1
+ ---
2
+ detectors:
3
+ TooManyConstants:
4
+ enabled: false
5
+ TooManyStatements:
6
+ max_statements: 7
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,53 @@
1
+ AllCops:
2
+ TargetRubyVersion: 3.1
3
+ NewCops: enable
4
+
5
+ require:
6
+ - rubocop-rspec
7
+ - rubocop-rake
8
+
9
+ Style/StringLiterals:
10
+ EnforcedStyle: double_quotes
11
+
12
+ Style/StringLiteralsInInterpolation:
13
+ EnforcedStyle: double_quotes
14
+
15
+ Layout/LineLength:
16
+ Max: 140
17
+
18
+ Metrics/BlockLength:
19
+ Exclude:
20
+ - "spec/**/*"
21
+
22
+ Metrics/ClassLength:
23
+ Max: 150
24
+
25
+ Metrics/MethodLength:
26
+ Max: 30
27
+
28
+ Metrics/ModuleLength:
29
+ Max: 150
30
+
31
+ Metrics/AbcSize:
32
+ Max: 35
33
+
34
+ Metrics/CyclomaticComplexity:
35
+ Max: 15
36
+
37
+ Metrics/PerceivedComplexity:
38
+ Max: 15
39
+
40
+ Style/Documentation:
41
+ Enabled: false
42
+
43
+ RSpec/NestedGroups:
44
+ Enabled: false
45
+
46
+ RSpec/MultipleExpectations:
47
+ Enabled: false
48
+
49
+ RSpec/ExampleLength:
50
+ Enabled: false
51
+
52
+ RSpec/ExpectActual:
53
+ Enabled: false
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Justyna Wojtczak
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,177 @@
1
+ # PriceScanner
2
+
3
+ Battle-tested multi-currency price extraction from text. Supports PLN, EUR, GBP, USD with Polish and English number formats.
4
+
5
+ ## Installation
6
+
7
+ ```ruby
8
+ gem "price_scanner"
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### Parse a single price
14
+
15
+ ```ruby
16
+ PriceScanner.parse("1.299,00 zł")
17
+ # => { amount: 1299.0, currency: "PLN", text: "1.299,00 zł" }
18
+
19
+ PriceScanner.parse("£49.99")
20
+ # => { amount: 49.99, currency: "GBP", text: "£49.99" }
21
+ ```
22
+
23
+ ### Extract all prices from text
24
+
25
+ ```ruby
26
+ PriceScanner.scan("Was £49.99 Now £29.99")
27
+ # => [{ amount: 49.99, currency: "GBP", text: "£49.99" },
28
+ # { amount: 29.99, currency: "GBP", text: "£29.99" }]
29
+ ```
30
+
31
+ ### Check if text contains a price
32
+
33
+ ```ruby
34
+ PriceScanner.contains_price?("Only 99,00 zł") # => true
35
+ PriceScanner.contains_price?("No price here") # => false
36
+ ```
37
+
38
+ ### GDPR consent detection (optional, requires nokogiri)
39
+
40
+ ```ruby
41
+ require "nokogiri"
42
+
43
+ doc = Nokogiri::HTML(html)
44
+ node = doc.css(".cookie-banner").first
45
+ PriceScanner::ConsentDetector.consent_node?(node) # => true/false
46
+ ```
47
+
48
+ ### Advanced API
49
+
50
+ For finer control, use `Detector` and `Parser` modules directly.
51
+
52
+ #### Detect prices in text
53
+
54
+ ```ruby
55
+ PriceScanner::Detector.contains_price?("see price: 49,00 zł") # => true
56
+
57
+ PriceScanner::Detector.extract_prices_from_text("Was 49,00 zł, now 29,00 zł")
58
+ # => [{ text: "49,00 zł", value: 49.0, position: 4 },
59
+ # { text: "29,00 zł", value: 29.0, position: 18 }]
60
+
61
+ PriceScanner::Detector::PRICE_PATTERN # => Regexp matching prices
62
+ ```
63
+
64
+ #### Parse and normalize prices
65
+
66
+ ```ruby
67
+ PriceScanner::Parser.normalized_price("1.299,00 zł") # => 1299.0
68
+ PriceScanner::Parser.normalized_price("$49.99") # => 49.99
69
+
70
+ PriceScanner::Parser.extract_currency("49,00 zł") # => "PLN"
71
+ PriceScanner::Parser.extract_currency("€120") # => "EUR"
72
+ ```
73
+
74
+ #### Strip price mentions from text
75
+
76
+ ```ruby
77
+ PriceScanner::Parser.strip_price_mentions("Buy for 49,00 zł or 59,00 zł", "49,00 zł", "59,00 zł")
78
+ # => "Buy for or"
79
+ ```
80
+
81
+ #### Build a regex for a specific price value
82
+
83
+ ```ruby
84
+ PriceScanner::Parser.price_regex_from_value("1.299,00 zł")
85
+ # => Regexp matching variations like "1 299,00 zł", "1299,00zł", etc.
86
+ ```
87
+
88
+ ## Supported currencies
89
+
90
+ | Currency | Symbol | Code | Example input | Parsed |
91
+ |----------|--------|------|---------------|--------|
92
+ | PLN | `zł`, `zl` | `PLN` | `1.299,00 zł` | 1299.0 |
93
+ | EUR | `€` | `EUR` | `€99,00` | 99.0 |
94
+ | USD | `$` | `USD` | `$1,019.00` | 1019.0 |
95
+ | GBP | `£` | `GBP` | `£49.99` | 49.99 |
96
+
97
+ Currency symbols and codes are matched case-insensitively (`pln`, `PLN`, `Pln` all work).
98
+
99
+ ## Supported number formats
100
+
101
+ | Format | Example | Parsed |
102
+ |--------|---------|--------|
103
+ | Dot thousands, comma decimal (Polish) | `1.299,00 zł` | 1299.0 |
104
+ | Space thousands, comma decimal | `1 299,00 zł` | 1299.0 |
105
+ | NBSP thousands, comma decimal | `1\u00a0299,00 zł` | 1299.0 |
106
+ | Comma thousands, dot decimal (English) | `$1,299.00` | 1299.0 |
107
+ | No thousands separator | `799,00 zł` | 799.0 |
108
+ | Integer (no decimals) | `£150` | 150.0 |
109
+ | Currency before amount | `zł 248,86` | 248.86 |
110
+
111
+ ## Smart filtering
112
+
113
+ Prices that match the following patterns are automatically excluded from results:
114
+
115
+ ### Negative prices
116
+
117
+ Prices preceded by `-` or `−` (U+2212) are treated as discount badges and filtered out.
118
+
119
+ ```ruby
120
+ PriceScanner.scan("449,00 zł -100,00 zł 349,00 zł")
121
+ # => [{ amount: 449.0, ... }, { amount: 349.0, ... }]
122
+ # -100,00 zł is excluded
123
+ ```
124
+
125
+ ### Price ranges
126
+
127
+ Two prices connected by an en dash (`–`), an em dash (`—`), or a spaced hyphen (` - `) are recognized as a range and both are removed.
128
+
129
+ ```ruby
130
+ PriceScanner.scan("Size S–XL, £3.29 – £92.71, buy now for £49.99")
131
+ # => [{ amount: 49.99, currency: "GBP", text: "£49.99" }]
132
+ # range £3.29 – £92.71 is excluded
133
+ ```
134
+
135
+ ### Savings amounts
136
+
137
+ When 3+ prices are detected and the smallest one equals the difference between two others (within a tolerance of 2% of that amount, or 1.0 if that is larger), it is treated as a savings amount and removed.
138
+
139
+ ```ruby
140
+ PriceScanner.scan("Was 449,00 zł, now 349,00 zł. You save 100,00 zł!")
141
+ # => [{ amount: 449.0, ... }, { amount: 349.0, ... }]
142
+ # 100,00 zł is excluded (449 - 349 = 100)
143
+ ```
144
+
145
+ ### Per-unit prices
146
+
147
+ Prices followed by a unit indicator are filtered out.
148
+
149
+ Supported units: `kg`, `g`, `mg`, `l`, `ml`, `szt`, `m`, `m²`, `m³`, `cm`, `mm`, `op`, `opak`, `pcs`, `pc`, `unit`, `each`, `ea`, `kaps`, `tabl`, `tab`
150
+
151
+ Recognized prefixes: `/` (slash) and `za` (Polish "per").
152
+
153
+ ```ruby
154
+ PriceScanner.scan("32,74 zł/kg — buy 500g for 16,37 zł")
155
+ # => [{ amount: 16.37, currency: "PLN", text: "16,37 zł" }]
156
+ # 32,74 zł/kg is excluded
157
+ ```
158
+
159
+ ### Deduplication
160
+
161
+ If the same price value appears multiple times, only the first occurrence is kept.
162
+
163
+ ## Features
164
+
165
+ - **Zero dependencies** (nokogiri optional, only for consent detection)
166
+ - Case-insensitive currency matching
167
+ - Handles regular spaces, non-breaking spaces (NBSP), and mixed whitespace
168
+ - Tracks position of each price in the source text
169
+ - Ignores letter-preceded numbers to avoid false positives from product codes (e.g. `DKA2zł`)
170
+
171
+ ## Used by
172
+
173
+ - [snipe.sale](https://snipe.sale) — price tracking service processing thousands of product pages daily
174
+
175
+ ## License
176
+
177
+ MIT License. See [LICENSE](LICENSE).
data/Rakefile ADDED
@@ -0,0 +1,9 @@
# frozen_string_literal: true

require "rspec/core/rake_task"
require "rubocop/rake_task"

# Register the RSpec task under the name `spec` and the RuboCop task under
# its default name.
RSpec::Core::RakeTask.new(:spec)
RuboCop::RakeTask.new

# A bare `rake` depends on both tasks: `spec` first, then `rubocop`.
task default: %i[spec rubocop]
@@ -0,0 +1,65 @@
# frozen_string_literal: true

module PriceScanner
  # Heuristic detection of GDPR/cookie consent banners in HTML nodes.
  #
  # Nodes are duck-typed: they must respond to +text+, +ancestors+, +[]+
  # (attribute lookup) and +css+, which is the interface Nokogiri nodes offer.
  module ConsentDetector
    # Consent-related vocabulary (English and Polish) looked for in node text.
    CONSENT_TEXT_REGEX = /
      \bcookie\b|\bcookies\b|\bconsent\b|\bgdpr\b|\bprivacy\b|\btracking\b|\bpreferences\b|\bpersonaliz|marketing\s+cookies|
      do\s+not\s+sell|opt\s+out|opt\s+in|cookie\s+policy|privacy\s+policy|
      \bciasteczk(?:a|i|ami|ach|om)?\b|\bprywatn|\bzgod(?:a|y|ę|zie)?\b|\brodo\b
    /ix

    # Labels of buttons/links that accept, reject or configure consent.
    CONSENT_ACTION_REGEX = /
      \baccept\b|\bagree\b|\ballow\b|\bmanage\b|\bpreferences\b|\bdecline\b|\breject\b|\bok\b|\bokay\b|\bcontinue\b|save\s+preferences|
      accept\s+all|allow\s+all|got\s+it|\brozumiem\b|\bzgadzam\b|\bakceptuj|\bzaakceptuj|\bodrzuc|\bodmow
    /ix

    # Substrings of attribute values that identify consent widgets and vendors.
    CONSENT_ATTR_REGEX = /
      cookie|consent|gdpr|privacy|cmp|onetrust|trustarc|cookielaw|cookiebot|osano|
      quantcast|usercentrics|didomi|cookieyes|termly|iubenda|shopify-pc__banner
    /ix

    # How many ancestors above the node are inspected as well.
    ANCESTOR_DEPTH = 3

    # Attributes whose values are tested against CONSENT_ATTR_REGEX.
    ATTR_KEYS = %w[id class role aria-label aria-modal].freeze

    # CSS selector for clickable elements that may carry a consent action.
    ACTION_SELECTOR = "button, [role='button'], input[type='button'], input[type='submit'], a"

    module_function

    # Decides whether +node+ (or one of its closest ancestors) looks like a
    # consent banner.
    #
    # A consent-like attribute value is sufficient on its own; consent-like
    # text only counts when an action button is also present.
    #
    # @param node [#text, #ancestors, #[], #css, nil] node to inspect
    # @return [Boolean]
    def consent_node?(node)
      return false unless node

      candidates = [node, *node.ancestors.take(ANCESTOR_DEPTH)]
      hits = detect_hits(candidates)
      return true if hits[:attr]

      hits[:text] && hits[:action]
    end

    # Collects the three signals across all candidate nodes.
    #
    # @return [Hash{Symbol => Boolean}] keys +:text+, +:attr+ and +:action+
    def detect_hits(nodes)
      {
        text: nodes.any? { |candidate| candidate.text.to_s.match?(CONSENT_TEXT_REGEX) },
        attr: nodes.any? { |candidate| attribute_text(candidate).match?(CONSENT_ATTR_REGEX) },
        action: nodes.any? { |candidate| action_button?(candidate) }
      }
    end

    # Joins the values of the inspected attributes that are present on +node+.
    def attribute_text(node)
      present = ATTR_KEYS.filter_map { |attr_name| node[attr_name] }
      present.join(" ")
    end

    # True when any clickable descendant of +node+ carries a consent action label.
    def action_button?(node)
      node.css(ACTION_SELECTOR).any? do |clickable|
        label = [clickable.text, clickable["aria-label"], clickable["title"], clickable["value"]].compact.join(" ")
        label.match?(CONSENT_ACTION_REGEX)
      end
    end

    # Visible text plus the labelling attributes of +node+, space separated.
    def collect_text(node)
      parts = [node.text, node["aria-label"], node["title"], node["value"]]
      parts.compact.join(" ")
    end

    private_class_method :detect_hits, :attribute_text, :action_button?, :collect_text
  end
end
@@ -0,0 +1,140 @@
# frozen_string_literal: true

module PriceScanner
  # Extracts prices from text using regex patterns with smart filtering.
  #
  # Processing order in +extract_prices_from_text+: regex scan (dropping
  # negative and per-unit matches) -> drop both ends of price ranges ->
  # de-duplicate by value -> drop a "you save" amount.
  module Detector
    # Three alternatives, tried in this order at every position:
    #   1. zł / pln / € / $ / £ before the amount, decimals optional
    #   2. amount with exactly two decimals, then a symbol or a currency code
    #   3. integer amount, then zł / pln / € / $ / £
    # Alternatives 2 and 3 refuse to start directly after an ASCII letter.
    #
    # NOTE(review): the thousands-separator class is [.\s\u00a0] and does not
    # include ",", so English-style input such as "$1,019.00" is matched only
    # as "$1,01" - confirm whether comma-grouped thousands should be supported.
    PRICE_PATTERN = /
      (?:zł|pln|€|\$|£)[\s\u00a0]*(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})(?:[.,]\d{1,2})? |
      (?<![a-zA-Z])(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})[.,]\d{2}[\s\u00a0]*(?:zł|pln|€|\$|£|eur|usd|gbp)(?!\d) |
      (?<![a-zA-Z])(?:\d{1,3}(?:[.\s\u00a0]\d{3})+|\d{1,4})[\s\u00a0]*(?:zł|pln|€|\$|£)(?!\d)
    /ix

    # "/kg", "za szt" and similar unit suffixes that mark a per-unit price.
    PER_UNIT_PATTERN = %r{(?:/\s*|za\s+)(?:kg|g|mg|l|ml|szt|m[²³23]?|cm|mm|op|opak|pcs|pc|unit|each|ea|kaps|tabl|tab)\b}i
    # Same pattern, but only at the very start of the inspected text.
    PER_UNIT_ANCHOR = /\A#{PER_UNIT_PATTERN.source}/i

    # Characters that, directly before a match, mark it as a negative amount.
    NEGATIVE_PREFIXES = ["-", "\u2212"].freeze

    # En dash, em dash, or a hyphen surrounded by whitespace.
    RANGE_SEPARATOR_PATTERN = /\s*[–—]\s*|\s+-\s+/

    # Number of characters after a match inspected for a unit suffix.
    TEXT_AFTER_LOOKAHEAD = 200
    MIN_PRICES_FOR_RANGE = 2
    MIN_PRICES_FOR_SAVINGS = 3
    # A difference below max(min_price * ratio, min diff) is never a savings match.
    SAVINGS_MIN_RATIO = 0.1
    SAVINGS_MIN_DIFF = 0.01
    # Allowed gap between the smallest price and the difference of two others:
    # max(min_price * ratio, absolute minimum).
    SAVINGS_TOLERANCE_RATIO = 0.02
    SAVINGS_TOLERANCE_MIN = 1.0

    module_function

    # Finds prices in +text+ and applies all filters.
    #
    # @param text [#to_s] source text
    # @return [Array<Hash>] one hash per price with +:text+ (match with
    #   whitespace collapsed), +:value+ (Float) and +:position+ (character
    #   offset of the match in the source text)
    def extract_prices_from_text(text)
      text_str = text.to_s
      raw_prices = scan_raw_prices(text_str)
      filtered = filter_range_prices(raw_prices, text_str)
      unique = filtered.uniq { |price| price[:value] }
      filter_savings_by_difference(unique)
    end

    # @param text [#to_s]
    # @return [Boolean] whether PRICE_PATTERN matches anywhere in +text+
    def contains_price?(text)
      text.to_s.match?(PRICE_PATTERN)
    end

    # Runs the regex over the text and builds a result for every usable match.
    #
    # The position comes from the regex engine's own match offset. Searching
    # for the matched substring again could hit an earlier, identical piece of
    # text that the pattern had rejected (e.g. because a digit follows the
    # currency code), which would misplace the position and the negative /
    # per-unit checks.
    def scan_raw_prices(text_str)
      results = []
      text_str.scan(PRICE_PATTERN) do
        match = Regexp.last_match
        price = build_price_result(text_str, match[0], match.begin(0))
        results << price if price
      end
      results
    end

    # Builds the result hash for one match, or returns nil when the match has
    # no parsable value, is negative, or is a per-unit price.
    def build_price_result(text_str, match_str, match_index)
      value = Parser.normalized_price(match_str)
      return unless value

      return if negative_price?(text_str, match_index)
      return if per_unit_price?(text_str, match_index + match_str.length)

      clean_text = match_str.gsub(Parser::COLLAPSE_WHITESPACE, " ").strip
      { text: clean_text, value: value, position: match_index }
    end

    # True when the character right before the match is a minus sign.
    def negative_price?(text_str, match_index)
      match_index.positive? && NEGATIVE_PREFIXES.include?(text_str[match_index - 1])
    end

    # True when the text right after the match starts with a unit suffix.
    def per_unit_price?(text_str, match_end)
      text_after = text_str[match_end, TEXT_AFTER_LOOKAHEAD].to_s.gsub(Parser::COLLAPSE_WHITESPACE, " ").lstrip
      text_after.match?(PER_UNIT_ANCHOR)
    end

    # Removes every price that is one end of a price range.
    def filter_range_prices(prices, text)
      return prices if prices.size < MIN_PRICES_FOR_RANGE

      range_indices = find_range_indices(prices, text)
      prices.reject.with_index { |_, idx| range_indices.include?(idx) }
    end

    # Indices of prices that sit on either side of a range separator.
    #
    # Returns a plain Array (possibly with repeats) rather than a Set, so no
    # `require "set"` is needed on Ruby 3.1, where Set is not autoloaded.
    def find_range_indices(prices, text)
      prices.each_cons(2).with_index.flat_map do |(current, next_price), idx|
        range_between?(current, next_price, text) ? [idx, idx + 1] : []
      end
    end

    # True when the text between two neighbouring prices contains a range separator.
    def range_between?(current, next_price, text)
      start_pos = current[:position] + current[:text].length
      end_pos = next_price[:position]
      return false if end_pos <= start_pos

      text[start_pos...end_pos].match?(RANGE_SEPARATOR_PATTERN)
    end

    # Drops the smallest price when it equals the difference between two
    # other prices (within tolerance), i.e. when it looks like a savings amount.
    def filter_savings_by_difference(prices)
      return prices if prices.size < MIN_PRICES_FOR_SAVINGS

      min_value = prices.map { |entry| entry[:value] }.min
      return prices unless savings_amount?(prices.map { |entry| entry[:value] }, min_value)

      prices.reject { |entry| entry[:value] == min_value }
    end

    # True when some pair of non-minimal values differs by about +min_value+.
    def savings_amount?(values, min_value)
      values.combination(2).any? do |first, second|
        next false if first == min_value || second == min_value

        matches_savings_pattern?((first - second).abs, min_value)
      end
    end

    # Compares a price difference with the candidate savings amount.
    def matches_savings_pattern?(diff, min_value)
      return false if diff < [min_value * SAVINGS_MIN_RATIO, SAVINGS_MIN_DIFF].max

      tolerance = [min_value * SAVINGS_TOLERANCE_RATIO, SAVINGS_TOLERANCE_MIN].max
      (min_value - diff).abs <= tolerance
    end

    private_class_method :scan_raw_prices, :build_price_result,
                         :negative_price?, :per_unit_price?,
                         :filter_range_prices, :find_range_indices, :range_between?,
                         :filter_savings_by_difference, :savings_amount?, :matches_savings_pattern?
  end
end
@@ -0,0 +1,127 @@
# frozen_string_literal: true

module PriceScanner
  # Normalizes price strings into floats and extracts currency codes.
  module Parser
    # Lower-cased currency symbol or code => three-letter currency code.
    CURRENCY_MAP = {
      "zł" => "PLN", "pln" => "PLN", "zl" => "PLN",
      "€" => "EUR", "eur" => "EUR",
      "$" => "USD", "usd" => "USD",
      "£" => "GBP", "gbp" => "GBP"
    }.freeze

    CURRENCY_SYMBOLS = CURRENCY_MAP.keys.map { |key| Regexp.escape(key) }.freeze
    CURRENCY_REGEX = /(#{CURRENCY_SYMBOLS.join("|")})/i
    CURRENCY_SUFFIX = /(?:#{CURRENCY_SYMBOLS.join("|")})/i

    MULTIPLE_SPACES = /\s{2,}/
    COLLAPSE_WHITESPACE = /\s+/
    NBSP = "\u00a0"
    DECIMAL_PLACES = 2
    THOUSANDS_GROUP = /.{1,3}/
    # Comma-free amount in which every dot is followed by exactly three
    # digits, e.g. "1.299" or "1.299.000": the dots group thousands.
    DOT_THOUSANDS = /\A[1-9]\d{0,2}(?:\.\d{3})+\z/

    module_function

    # Converts a price string to a Float.
    #
    # Handles "1.299,00" (dot thousands, comma decimal), "1,299.00" (comma
    # thousands, dot decimal), whitespace/NBSP thousands and dot-only
    # thousands such as "1.299" or "1.299.000".
    #
    # @param value [#to_s] price text, with or without a currency
    # @return [Float, nil] nil when no number can be read from the text
    def normalized_price(value)
      text = value.to_s.tr(NBSP, " ").strip
      return nil if text.empty?

      clean = clean_price_text(text)
      return nil unless clean

      Float(clean, exception: false)
    end

    # @param value [#to_s] text containing a currency symbol or code
    # @return [String, nil] "PLN", "EUR", "USD" or "GBP"; nil when none is found
    def extract_currency(value)
      text = value.to_s
      return nil if text.empty?

      resolve_currency(text.match(CURRENCY_REGEX))
    end

    # Removes the given prices (and formatting variants of them) from +text+.
    #
    # @param text [#to_s]
    # @param prices [Array<#to_s, nil>] price strings to strip; nils are skipped
    # @return [String] text with NBSPs turned into spaces, prices removed and
    #   runs of whitespace collapsed
    def strip_price_mentions(text, *prices)
      without_prices = prices.compact.reduce(text.to_s.tr(NBSP, " ")) do |cleaned, price|
        strip_single_price(cleaned, price)
      end
      without_prices.gsub(MULTIPLE_SPACES, " ").strip
    end

    # Builds a regex matching a price value with optional whitespace between
    # thousands groups, "." or "," as decimal mark and an optional currency.
    #
    # @param value [Numeric, String] a number, a numeric string, or a price
    #   string such as "1.299,00 zł"
    # @return [Regexp]
    # @raise [ArgumentError, TypeError] when +value+ holds no number
    def price_regex_from_value(value)
      integer, decimals = split_price_parts(value)
      int_pattern = thousands_pattern(integer)
      /#{int_pattern}[.,]#{decimals}\s?#{CURRENCY_SUFFIX.source}?/i
    end

    # @return [Array<String>] integer part and fixed-width decimal part
    def split_price_parts(value)
      format("%.#{DECIMAL_PLACES}f", numeric_amount(value)).split(".")
    end

    # Splits an integer string into groups of three digits, counted from the right.
    def thousands_groups(integer)
      integer.reverse.scan(THOUSANDS_GROUP).map(&:reverse).reverse
    end

    # Regex source for the integer part with an optional separator between groups.
    def thousands_pattern(integer)
      thousands_groups(integer).join("[\\s\\u00a0]?")
    end

    # Leaves numbers and numeric strings untouched; turns a price string that
    # Float() cannot read into its normalized amount. Unparsable input is
    # returned as is, so that +format+ raises as it did before.
    def numeric_amount(value)
      return value unless value.is_a?(String)
      return value if Float(value, exception: false)

      normalized_price(value) || value
    end

    # Keeps digits and separators only, then resolves the separators.
    def clean_price_text(text)
      digits = text.gsub(/[^\d.,]/, "")
      return nil if digits.empty?

      normalize_separators(digits)
    end

    def resolve_currency(match)
      return nil unless match

      symbol = match[1]
      CURRENCY_MAP.fetch(symbol.downcase, symbol.upcase)
    end

    # Removes the literal price, its space-less form, and regex variants of its value.
    def strip_single_price(cleaned, price)
      normalized = price.to_s.tr(NBSP, " ").strip
      return cleaned if normalized.empty?

      result = cleaned.gsub(normalized, "").gsub(normalized.delete(" "), "")
      price_value = normalized_price(price)
      return result unless price_value

      result.gsub(price_regex_from_value(price_value), "")
    end

    # Rewrites the amount so that "." is the only separator left and marks decimals.
    def normalize_separators(clean)
      return resolve_dot_only(clean) unless clean.include?(",")

      if clean.include?(".")
        resolve_mixed_separators(clean)
      else
        resolve_comma_only(clean)
      end
    end

    # Dots that only group thousands are removed; anything else is left alone.
    def resolve_dot_only(clean)
      clean.match?(DOT_THOUSANDS) ? clean.delete(".") : clean
    end

    # With both separators present, the one that appears last is the decimal mark.
    def resolve_mixed_separators(clean)
      if clean.rindex(",") > clean.rindex(".")
        clean.delete(".").tr(",", ".")
      else
        clean.delete(",")
      end
    end

    # With commas only, the last comma is taken as the decimal mark.
    def resolve_comma_only(clean)
      parts = clean.split(",")
      if parts.size == 2
        clean.tr(",", ".")
      else
        "#{parts[0...-1].join}.#{parts.last}"
      end
    end

    private_class_method :numeric_amount, :clean_price_text, :resolve_currency, :strip_single_price,
                         :normalize_separators, :resolve_dot_only, :resolve_mixed_separators,
                         :resolve_comma_only
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module PriceScanner
  # Version string of the gem; the same value appears in the gem metadata.
  VERSION = "0.1.0"
end
@@ -0,0 +1,30 @@
# frozen_string_literal: true

Dir.glob(File.join(__dir__, "price_scanner", "*.rb")).each { |f| require_relative f }

# Multi-currency price extraction from text.
#
# Thin facade over Detector (finding prices) and Parser (currency lookup).
module PriceScanner
  module_function

  # Returns the first price found in +text+.
  #
  # @param text [#to_s]
  # @return [Hash, nil] +{ amount:, currency:, text: }+, or nil when the text
  #   contains no price
  def parse(text)
    first_price = Detector.extract_prices_from_text(text).first
    first_price && build_result(first_price)
  end

  # Returns every price found in +text+, in the order the detector reports them.
  #
  # @param text [#to_s]
  # @return [Array<Hash>] hashes shaped like the result of +parse+
  def scan(text)
    detected = Detector.extract_prices_from_text(text)
    detected.map { |entry| build_result(entry) }
  end

  # @param text [#to_s]
  # @return [Boolean] whether the detector's price pattern matches +text+
  def contains_price?(text)
    Detector.contains_price?(text)
  end

  # Converts a detector entry (+:text+, +:value+) into the public result hash.
  def build_result(price)
    matched = price[:text]
    currency = Parser.extract_currency(matched)
    { amount: price[:value], currency: currency, text: matched }
  end

  private_class_method :build_result
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: price_scanner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Justyna
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: Battle-tested price parser supporting PLN, EUR, GBP, USD. Extracts prices
13
+ from text, handles Polish and English number formats, filters savings badges and
14
+ price ranges.
15
+ email:
16
+ - justine84@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".reek.yml"
22
+ - ".rspec"
23
+ - ".rubocop.yml"
24
+ - LICENSE
25
+ - README.md
26
+ - Rakefile
27
+ - lib/price_scanner.rb
28
+ - lib/price_scanner/consent_detector.rb
29
+ - lib/price_scanner/detector.rb
30
+ - lib/price_scanner/parser.rb
31
+ - lib/price_scanner/version.rb
32
+ homepage: https://github.com/justi/price_scanner
33
+ licenses:
34
+ - MIT
35
+ metadata:
36
+ homepage_uri: https://github.com/justi/price_scanner
37
+ source_code_uri: https://github.com/justi/price_scanner
38
+ changelog_uri: https://github.com/justi/price_scanner/blob/main/CHANGELOG.md
39
+ rubygems_mfa_required: 'true'
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 3.1.0
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubygems_version: 3.6.7
55
+ specification_version: 4
56
+ summary: Multi-currency price extraction from text
57
+ test_files: []