lighterpack-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 443e65a776623c5e587889eecf74da63064015eae8a3d1882b3e0a5ad5864675
4
+ data.tar.gz: 25cb7cf2b5cea5deeadcc82a53665ac999c69895ee3754a3174929a6b43f3264
5
+ SHA512:
6
+ metadata.gz: 95e5525bd874996437de92e6a486a530ca23677e035c640bb7b4d7b2a7662189a67ea97a7cfa19913973e7ae875822517b05e73a66c0e0060dc1c28d27e06128
7
+ data.tar.gz: 14af29e67a8f67ed1552e7e9dc2801005a214d800ab788dea0fc1add26478c7df4e780262278271a3f33945d6866f2df42fa511770df083e9c8f4383ff530bb1
data/README.md ADDED
@@ -0,0 +1,108 @@
1
+ # Lighterpack Parser
2
+
3
+ A Ruby gem for parsing Lighterpack lists from HTML or URLs.
4
+
5
+ ## Installation
6
+
7
+ This gem is used as a local dependency in the Packlista project. It's referenced in the backend `Gemfile`:
8
+
9
+ ```ruby
10
+ gem 'lighterpack-parser', path: '../lighterpack-parser', require: 'lighterpack_parser'
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ### Parse from HTML string
16
+
17
+ ```ruby
18
+ require 'lighterpack_parser'
19
+
20
+ html = File.read('path/to/lighterpack.html')
21
+ result = LighterpackParser::Parser.new(html: html).parse
22
+
23
+ # Result structure:
24
+ # {
25
+ # name: "List Name",
26
+ # description: "List description (optional)",
27
+ # categories: [
28
+ # {
29
+ # name: "Category Name",
30
+ # description: "Category description (optional)",
31
+ # items: [
32
+ # {
33
+ # name: "Item Name",
34
+ # description: "Item description",
35
+ # weight: 476.0, # in grams
36
+ # quantity: 1,
37
+ # image_url: "https://...",
38
+ # consumable: false,
39
+ # worn: false
40
+ # }
41
+ # ]
42
+ # }
43
+ # ]
44
+ # }
45
+ ```
46
+
47
+ ### Parse from URL
48
+
49
+ ```ruby
50
+ require 'lighterpack_parser'
51
+
52
+ # Using the parser directly
53
+ result = LighterpackParser::Parser.new(url: 'https://lighterpack.com/r/b6q1kr').parse
54
+
55
+ # Or using the convenience method
56
+ result = LighterpackParser.parse_url('https://lighterpack.com/r/b6q1kr')
57
+ ```
58
+
59
+ ## Running Tests
60
+
61
+ To run the test suite:
62
+
63
+ ```bash
64
+ rspec
65
+ ```
66
+
67
+ ## Test Fixtures
68
+
69
+ Test fixtures are stored in `test/fixtures/` and contain HTML from example Lighterpack lists:
70
+ - `b6q1kr.html` - Ultimate Hike 2025
71
+ - `adbf7c.html` - Example list 2
72
+ - `h23rxt.html` - Example list 3
73
+
74
+ To update fixtures, download fresh HTML:
75
+
76
+ ```bash
77
+ curl -s "https://lighterpack.com/r/b6q1kr" > test/fixtures/b6q1kr.html
78
+ ```
79
+
80
+ ## Features
81
+
82
+ - Parses list name and description
83
+ - Extracts categories with descriptions
84
+ - Extracts items with:
85
+ - Name and description
86
+ - Weight (automatically converted to grams)
87
+ - Quantity
88
+ - Image URLs
89
+ - Consumable flag
90
+ - Worn flag
91
+ - Supports weight units: oz, lb, g, kg (all converted to grams)
92
+ - Handles both HTML strings and URLs
93
+
94
+ ## Weight Conversion
95
+
96
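+ The parser returns all weights in grams. It prefers the milligram value Lighterpack embeds in each item (`input.lpMG`, divided by 1000); when that value is missing, the displayed weight is converted by unit: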
97
+ - `oz` → multiply by 28.3495
98
+ - `lb` → multiply by 453.592
99
+ - `g` → use as-is
100
+ - `kg` → multiply by 1000
101
+
102
+ ## Development
103
+
104
+ To install dependencies locally:
105
+
106
+ ```bash
107
+ bundle install
108
+ ```
@@ -0,0 +1,264 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'httparty'
5
+
6
module LighterpackParser
  # Extracts the name, description, categories and items of a Lighterpack
  # list from the HTML of its page.
  #
  # The HTML is either supplied directly or downloaded from a URL while the
  # parser is being constructed. #parse returns plain hashes and arrays.
  class Parser
    # Multipliers that turn a displayed weight into grams, keyed by the
    # lower-case unit label.
    GRAMS_PER_UNIT = {
      'oz' => 28.3495,
      'lb' => 453.592,
      'g' => 1,
      'kg' => 1000
    }.freeze

    # A non-negative decimal number with nothing else around it.
    PLAIN_NUMBER = /\A\d+(?:\.\d+)?\z/.freeze

    # @param html [String, nil] HTML of a Lighterpack list page
    # @param url [String, nil] address to download the HTML from; takes
    #   precedence over +html+ when both are given
    # @raise [ArgumentError] when neither +html+ nor +url+ is provided
    # @raise [RuntimeError] when the download does not succeed
    def initialize(html: nil, url: nil)
      raise ArgumentError, 'Either html or url must be provided' unless url || html

      @html = url ? fetch_html(url) : html
    end

    # Parses the stored HTML.
    #
    # @return [Hash] +:name+ (String), +:description+ (String or nil) and
    #   +:categories+ (Array of Hash, see #extract_categories)
    def parse
      doc = Nokogiri::HTML(@html)

      {
        name: extract_list_name(doc),
        description: extract_list_description(doc),
        categories: extract_categories(doc)
      }
    end

    private

    # Downloads +url+ (30 second timeout) and returns the response body.
    # Raises a RuntimeError carrying the status code on a non-success reply.
    def fetch_html(url)
      response = HTTParty.get(url, timeout: 30)
      raise "Failed to fetch URL: #{response.code}" unless response.success?

      response.body
    end

    # Returns the stripped text of the first element found among
    # h1.lpListName, any h1 and title, or 'Untitled List' when none exists.
    def extract_list_name(doc)
      %w[h1.lpListName h1 title].each do |selector|
        node = doc.at_css(selector)
        return node.text.strip if node
      end

      'Untitled List'
    end

    # Returns the content of the description meta tag, or nil when the tag
    # or its content attribute is absent.
    def extract_list_description(doc)
      meta = doc.at_css('meta[name="description"]')
      meta && meta['content']
    end

    # Builds one hash per li.lpCategory under ul.lpCategories, skipping
    # categories that have no h2.lpCategoryName or an empty name.
    #
    # @return [Array<Hash>] hashes with +:name+, +:description+ and +:items+
    def extract_categories(doc)
      category_nodes = doc.css('ul.lpCategories > li.lpCategory')

      category_nodes.each_with_object([]) do |category_element, categories|
        header = category_element.at_css('h2.lpCategoryName')
        next unless header

        category_name = header.text.strip
        next if category_name.empty?

        categories << {
          name: category_name,
          description: extract_category_description(category_name),
          items: extract_items_for_category(category_element)
        }
      end
    end

    # Returns the text of the first parenthesised group in the category
    # name, e.g. "Pack, Tent" for "Big 3 (Pack, Tent)", or nil without one.
    def extract_category_description(category_name)
      match = category_name.match(/\(([^)]+)\)/)
      match && match[1]
    end

    # Returns the items of the category's ul.lpItems list; li.lpItem
    # elements for which #extract_item yields nil are dropped.
    def extract_items_for_category(category_element)
      items_list = category_element.at_css('ul.lpItems')
      return [] unless items_list

      items_list.css('li.lpItem').map { |item_element| extract_item(item_element) }.compact
    end

    # Builds the item hash for one li.lpItem element.
    #
    # @return [Hash, nil] nil when the element has no span.lpName
    def extract_item(element)
      name = extract_item_name(element)
      return nil unless name

      {
        name: name,
        description: extract_item_description(element),
        weight: extract_weight(element)[:weight_grams],
        quantity: extract_quantity(element),
        image_url: extract_image_url(element),
        consumable: extract_consumable_flag(element),
        worn: extract_worn_flag(element)
      }
    end

    # Returns the stripped text of span.lpName, or nil when it is missing.
    def extract_item_name(element)
      name_elem = element.at_css('span.lpName')
      name_elem && name_elem.text.strip
    end

    # Determines the item weight in grams.
    #
    # The milligram value of input.lpMG is preferred. When that input is
    # missing, or its value is blank or not a plain number, the displayed
    # span.lpWeight is converted using the displayed unit instead, so an
    # empty input no longer hides a perfectly readable weight.
    #
    # @return [Hash] +:weight_grams+ (Float) and +:original_unit+ (String)
    def extract_weight(element)
      mg_input = element.at_css('input.lpMG')
      milligrams = mg_input && plain_number(mg_input['value'])
      return { weight_grams: milligrams / 1000.0, original_unit: 'g' } if milligrams

      weight_elem = element.at_css('span.lpWeight')
      return { weight_grams: 0.0, original_unit: 'g' } unless weight_elem

      unit = displayed_unit(element)
      weight_value = weight_elem.text.strip.to_f
      { weight_grams: convert_to_grams(weight_value, unit), original_unit: unit }
    end

    # Returns the displayed unit label when it is one of the supported
    # units, otherwise 'g'.
    def displayed_unit(element)
      unit_elem = element.at_css('span.lpDisplay, select.lpUnit option[selected]')
      unit_text = unit_elem && unit_elem.text.strip.downcase
      GRAMS_PER_UNIT.key?(unit_text) ? unit_text : 'g'
    end

    # Returns +text+ as a Float when, once stripped, it is a plain
    # non-negative decimal number; nil otherwise (including for nil).
    def plain_number(text)
      stripped = text.to_s.strip
      stripped.to_f if PLAIN_NUMBER.match?(stripped)
    end

    # Converts +value+ from +unit+ (case-insensitive) to grams. Unknown
    # units are assumed to already be grams.
    def convert_to_grams(value, unit)
      value * GRAMS_PER_UNIT.fetch(unit.downcase, 1)
    end

    # Determines the item quantity.
    #
    # The span.lpQtyCell text is used when the whole stripped text consists
    # of digits (anchored with \A/\z so that a digits-only line inside
    # multi-line text cannot slip through and be read as 0). Otherwise the
    # element's qty attribute is used when it starts with a digit. Anything
    # else, including a blank or non-numeric attribute, gives the default 1.
    def extract_quantity(element)
      qty_elem = element.at_css('span.lpQtyCell')
      if qty_elem
        qty_text = qty_elem.text.strip
        return qty_text.to_i if qty_text.match?(/\A\d+\z/)
      end

      qty_attr = element['qty'].to_s
      return qty_attr.to_i if qty_attr.match?(/\A\s*\d/)

      1
    end

    # Returns the stripped text of span.lpDescription, or nil when the
    # element is missing or its text is blank.
    def extract_item_description(element)
      desc_elem = element.at_css('span.lpDescription')
      return nil unless desc_elem

      text = desc_elem.text.strip
      text.empty? ? nil : text
    end

    # Returns the src (or, failing that, href) attribute of img.lpItemImage
    # with escaped slashes and equals signs decoded, or nil when there is no
    # such image or attribute.
    def extract_image_url(element)
      img = element.at_css('img.lpItemImage')
      return nil unless img

      raw_url = img['src'] || img['href']
      raw_url && raw_url.gsub('&#x2F;', '/').gsub('&#x3D;', '=')
    end

    # True when the item has an active consumable icon.
    def extract_consumable_flag(element)
      active_icon?(element, 'lpConsumable')
    end

    # True when the item has an active worn icon.
    def extract_worn_flag(element)
      active_icon?(element, 'lpWorn')
    end

    # True when the element contains an i.lpSprite icon carrying both
    # +icon_class+ and lpActive. CSS class selectors match whole
    # whitespace-separated class names, so irregular spacing is handled and
    # look-alike classes such as "lpActiveHover" are not mistaken for
    # lpActive.
    def active_icon?(element, icon_class)
      !element.at_css("i.lpSprite.#{icon_class}.lpActive").nil?
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module LighterpackParser
  # Release number of the gem; the gemspec reads it to set the gem version.
  VERSION = '0.1.0'
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lighterpack_parser/version'
4
+ require_relative 'lighterpack_parser/parser'
5
+
6
module LighterpackParser
  class << self
    # Shortcut that builds a Parser for +url+ and returns its parse result.
    #
    # @param url [String] address of the Lighterpack list page
    # @return [Hash] the value returned by Parser#parse
    def parse_url(url)
      parser = Parser.new(url: url)
      parser.parse
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/lighterpack_parser/version'
4
+
5
# Gem packaging metadata for lighterpack-parser.
Gem::Specification.new do |spec|
  spec.name = 'lighterpack-parser'
  spec.version = LighterpackParser::VERSION
  spec.authors = ['Packlista Team']
  spec.email = ['team@packlista.com']

  spec.summary = 'Parser for Lighterpack lists'
  spec.description = 'Parse Lighterpack HTML to extract list data including categories, items, weights, and metadata'
  spec.homepage = 'https://github.com/alex-ross/lighterpack-parser'
  spec.license = 'MIT'

  # Declared explicitly so RubyGems rejects unsupported interpreters up
  # front. NOTE(review): 2.7 is taken from the nokogiri ~> 1.15 dependency's
  # own minimum; confirm against the Ruby versions the project targets.
  spec.required_ruby_version = '>= 2.7.0'

  # Packaged files: library code, specs, Markdown docs and this gemspec.
  spec.files = Dir['lib/**/*', 'spec/**/*', '*.md', '*.gemspec']
  spec.require_paths = ['lib']

  # Runtime dependencies: HTML parsing and HTTP download.
  spec.add_dependency 'nokogiri', '~> 1.15'
  spec.add_dependency 'httparty', '~> 0.21'

  spec.add_development_dependency 'rspec', '~> 3.12'
end