lighterpack-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +108 -0
- data/lib/lighterpack_parser/parser.rb +264 -0
- data/lib/lighterpack_parser/version.rb +5 -0
- data/lib/lighterpack_parser.rb +11 -0
- data/lighterpack-parser.gemspec +23 -0
- data/spec/fixtures/adbf7c.html +2911 -0
- data/spec/fixtures/b6q1kr.html +2948 -0
- data/spec/fixtures/h23rxt.html +1660 -0
- data/spec/parser_spec.rb +218 -0
- data/spec/spec_helper.rb +15 -0
- metadata +96 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 443e65a776623c5e587889eecf74da63064015eae8a3d1882b3e0a5ad5864675
|
|
4
|
+
data.tar.gz: 25cb7cf2b5cea5deeadcc82a53665ac999c69895ee3754a3174929a6b43f3264
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 95e5525bd874996437de92e6a486a530ca23677e035c640bb7b4d7b2a7662189a67ea97a7cfa19913973e7ae875822517b05e73a66c0e0060dc1c28d27e06128
|
|
7
|
+
data.tar.gz: 14af29e67a8f67ed1552e7e9dc2801005a214d800ab788dea0fc1add26478c7df4e780262278271a3f33945d6866f2df42fa511770df083e9c8f4383ff530bb1
|
data/README.md
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# Lighterpack Parser
|
|
2
|
+
|
|
3
|
+
A Ruby gem for parsing Lighterpack lists from HTML or URLs.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
This gem is used as a local dependency in the Packlista project. It's referenced in the backend `Gemfile`:
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
gem 'lighterpack-parser', path: '../lighterpack-parser', require: 'lighterpack_parser'
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
### Parse from HTML string
|
|
16
|
+
|
|
17
|
+
```ruby
|
|
18
|
+
require 'lighterpack_parser'
|
|
19
|
+
|
|
20
|
+
html = File.read('path/to/lighterpack.html')
|
|
21
|
+
result = LighterpackParser::Parser.new(html: html).parse
|
|
22
|
+
|
|
23
|
+
# Result structure:
|
|
24
|
+
# {
|
|
25
|
+
# name: "List Name",
|
|
26
|
+
# description: "List description (optional)",
|
|
27
|
+
# categories: [
|
|
28
|
+
# {
|
|
29
|
+
# name: "Category Name",
|
|
30
|
+
# description: "Category description (optional)",
|
|
31
|
+
# items: [
|
|
32
|
+
# {
|
|
33
|
+
# name: "Item Name",
|
|
34
|
+
# description: "Item description",
|
|
35
|
+
# weight: 476.0, # in grams
|
|
36
|
+
# quantity: 1,
|
|
37
|
+
# image_url: "https://...",
|
|
38
|
+
# consumable: false,
|
|
39
|
+
# worn: false
|
|
40
|
+
# }
|
|
41
|
+
# ]
|
|
42
|
+
# }
|
|
43
|
+
# ]
|
|
44
|
+
# }
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Parse from URL
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
require 'lighterpack_parser'
|
|
51
|
+
|
|
52
|
+
# Using the parser directly
|
|
53
|
+
result = LighterpackParser::Parser.new(url: 'https://lighterpack.com/r/b6q1kr').parse
|
|
54
|
+
|
|
55
|
+
# Or using the convenience method
|
|
56
|
+
result = LighterpackParser.parse_url('https://lighterpack.com/r/b6q1kr')
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Running Tests
|
|
60
|
+
|
|
61
|
+
To run the test suite:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
rspec
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Test Fixtures
|
|
68
|
+
|
|
69
|
+
Test fixtures are stored in `spec/fixtures/` and contain HTML from example Lighterpack lists:
|
|
70
|
+
- `b6q1kr.html` - Ultimate Hike 2025
|
|
71
|
+
- `adbf7c.html` - Example list 2
|
|
72
|
+
- `h23rxt.html` - Example list 3
|
|
73
|
+
|
|
74
|
+
To update fixtures, download fresh HTML:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
curl -s "https://lighterpack.com/r/b6q1kr" > spec/fixtures/b6q1kr.html
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Features
|
|
81
|
+
|
|
82
|
+
- Parses list name and description
|
|
83
|
+
- Extracts categories with descriptions
|
|
84
|
+
- Extracts items with:
|
|
85
|
+
- Name and description
|
|
86
|
+
- Weight (automatically converted to grams)
|
|
87
|
+
- Quantity
|
|
88
|
+
- Image URLs
|
|
89
|
+
- Consumable flag
|
|
90
|
+
- Worn flag
|
|
91
|
+
- Supports weight units: oz, lb, g, kg (all converted to grams)
|
|
92
|
+
- Handles both HTML strings and URLs
|
|
93
|
+
|
|
94
|
+
## Weight Conversion
|
|
95
|
+
|
|
96
|
+
The parser automatically converts all weights to grams:
|
|
97
|
+
- `oz` → multiply by 28.3495
|
|
98
|
+
- `lb` → multiply by 453.592
|
|
99
|
+
- `g` → use as-is
|
|
100
|
+
- `kg` → multiply by 1000
|
|
101
|
+
|
|
102
|
+
## Development
|
|
103
|
+
|
|
104
|
+
To install dependencies locally:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
bundle install
|
|
108
|
+
```
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
require 'httparty'
|
|
5
|
+
|
|
6
|
+
module LighterpackParser
|
|
7
|
+
class Parser
|
|
8
|
+
# Builds a parser from raw HTML or from a URL whose page is downloaded.
#
# When both arguments are given, the URL wins and the page is fetched.
#
# @param html [String, nil] page markup to parse
# @param url [String, nil] address to download the markup from
# @raise [ArgumentError] when neither html nor url is supplied
def initialize(html: nil, url: nil)
  raise ArgumentError, 'Either html or url must be provided' unless url || html

  @html = url ? fetch_html(url) : html
end
|
|
17
|
+
|
|
18
|
+
# Parses the stored markup and assembles the list as a hash.
#
# @return [Hash] with the keys :name, :description and :categories
def parse
  document = Nokogiri::HTML(@html)
  list_name = extract_list_name(document)
  list_description = extract_list_description(document)
  list_categories = extract_categories(document)

  { name: list_name, description: list_description, categories: list_categories }
end
|
|
27
|
+
|
|
28
|
+
private
|
|
29
|
+
|
|
30
|
+
# Requests the given URL (30 second timeout) and hands back the response body.
#
# @param url [String] address to request
# @return the body of a successful response
# @raise [RuntimeError] carrying the response code when the request did not succeed
def fetch_html(url)
  reply = HTTParty.get(url, timeout: 30)
  return reply.body if reply.success?

  raise "Failed to fetch URL: #{reply.code}"
end
|
|
35
|
+
|
|
36
|
+
# Determines the title of the list.
#
# Candidates are tried in order: h1.lpListName, any h1, then <title>.
# The first candidate whose stripped text is non-empty wins, so an empty
# heading does not mask a usable fallback further down the chain.
#
# @param doc [#at_css] parsed document
# @return [String] the stripped title, or 'Untitled List' when no candidate has text
def extract_list_name(doc)
  %w[h1.lpListName h1 title].each do |selector|
    node = doc.at_css(selector)
    next unless node

    text = node.text.strip
    return text unless text.empty?
  end

  'Untitled List'
end
|
|
50
|
+
|
|
51
|
+
# Reads the list description from the page's description meta tag.
#
# @param doc [#at_css] parsed document
# @return the tag's content attribute, or nil when the tag or attribute is missing
def extract_list_description(doc)
  meta_tag = doc.at_css('meta[name="description"]')
  return nil unless meta_tag

  meta_tag['content'] || nil
end
|
|
59
|
+
|
|
60
|
+
# Collects every category found under ul.lpCategories > li.lpCategory.
#
# Entries without an h2.lpCategoryName heading, or with a blank one, are skipped.
#
# @param doc [#css] parsed document
# @return [Array<Hash>] hashes with the keys :name, :description and :items
def extract_categories(doc)
  doc.css('ul.lpCategories > li.lpCategory').each_with_object([]) do |node, collected|
    heading = node.at_css('h2.lpCategoryName')
    next unless heading

    title = heading.text.strip
    next if title.empty?

    # The description is derived from the heading text itself (parenthesised part).
    collected << {
      name: title,
      description: extract_category_description(title),
      items: extract_items_for_category(node)
    }
  end
end
|
|
87
|
+
|
|
88
|
+
# Pulls the first parenthesised fragment out of a category name,
# e.g. "Big 3 (Pack, Tent, Sleep System)" yields "Pack, Tent, Sleep System".
#
# @param category_name [String] full category heading
# @return [String, nil] text between the parentheses, or nil when there is none
def extract_category_description(category_name)
  category_name[/\(([^)]+)\)/, 1]
end
|
|
96
|
+
|
|
97
|
+
# Gathers the items listed inside a category's ul.lpItems element.
#
# Only li.lpItem entries are visited; entries for which extract_item yields
# nothing, or a hash without a :name, are dropped.
#
# @param category_element [#at_css] category node
# @return [Array<Hash>] parsed items (empty when the category has no item list)
def extract_items_for_category(category_element)
  list_node = category_element.at_css('ul.lpItems')
  return [] unless list_node

  list_node.css('li.lpItem').each_with_object([]) do |node, found|
    parsed = extract_item(node)
    found << parsed if parsed && parsed[:name]
  end
end
|
|
112
|
+
|
|
113
|
+
# Assembles a single item hash from an item element.
#
# @param element item node
# @return [Hash, nil] hash with :name, :description, :weight (grams), :quantity,
#   :image_url, :consumable and :worn; nil when the item has no name
def extract_item(element)
  item_name = extract_item_name(element)
  return nil unless item_name

  {
    name: item_name,
    description: extract_item_description(element),
    weight: extract_weight(element)[:weight_grams],
    quantity: extract_quantity(element),
    image_url: extract_image_url(element),
    consumable: extract_consumable_flag(element),
    worn: extract_worn_flag(element)
  }
end
|
|
136
|
+
|
|
137
|
+
# Reads the item name from span.lpName.
#
# @param element [#at_css] item node
# @return [String, nil] stripped name, or nil when the span is absent
def extract_item_name(element)
  node = element.at_css('span.lpName')
  node ? node.text.strip : nil
end
|
|
144
|
+
|
|
145
|
+
# Works out the item's weight in grams.
#
# The value attribute of input.lpMG is treated as milligrams and preferred.
# Otherwise the text of span.lpWeight is converted using the unit found in
# span.lpDisplay or the selected option of select.lpUnit (grams when the unit
# is missing or not one of oz, lb, g, kg).
#
# @param element [#at_css] item node
# @return [Hash] with :weight_grams (Float) and :original_unit (String)
def extract_weight(element)
  milligrams = element.at_css('input.lpMG')
  if milligrams && milligrams['value']
    return { weight_grams: milligrams['value'].to_f / 1000.0, original_unit: 'g' }
  end

  weight_node = element.at_css('span.lpWeight')
  unit_node = element.at_css('span.lpDisplay, select.lpUnit option[selected]')
  return { weight_grams: 0.0, original_unit: 'g' } unless weight_node

  amount = weight_node.text.strip.to_f
  label = unit_node ? unit_node.text.strip.downcase : nil
  unit = %w[oz lb g kg].include?(label) ? label : 'g'

  { weight_grams: convert_to_grams(amount, unit), original_unit: unit }
end
|
|
173
|
+
|
|
174
|
+
# Converts a weight expressed in the given unit to grams.
#
# @param value [Numeric] weight in +unit+
# @param unit [String] 'oz', 'lb', 'g' or 'kg' (case-insensitive)
# @return [Numeric] weight in grams; unrecognised units are assumed to be grams already
def convert_to_grams(value, unit)
  grams_per_unit = { 'oz' => 28.3495, 'lb' => 453.592, 'kg' => 1000 }
  factor = grams_per_unit[unit.downcase]

  # 'g' and unknown units have no factor: the value is passed through untouched.
  factor ? value * factor : value
end
|
|
188
|
+
|
|
189
|
+
# Determines how many of the item are packed.
#
# The stripped text of span.lpQtyCell is used when it consists solely of
# digits. The check is anchored with \A and \z so that multi-line text such
# as "x\n3" is rejected rather than matched on one of its lines and then
# converted to 0 by to_i. Otherwise the element's qty attribute is used,
# and failing that the quantity defaults to 1.
#
# @param element [#at_css, #[]] item node
# @return [Integer] quantity
def extract_quantity(element)
  qty_elem = element.at_css('span.lpQtyCell')
  if qty_elem
    qty_text = qty_elem.text.strip
    return qty_text.to_i if qty_text.match?(/\A\d+\z/)
  end

  # Fall back to the qty attribute on the item element itself.
  qty_attr = element['qty']
  return qty_attr.to_i if qty_attr

  1 # Default quantity
end
|
|
203
|
+
|
|
204
|
+
# Reads the item description from span.lpDescription.
#
# @param element [#at_css] item node
# @return [String, nil] stripped description, or nil when absent or blank
def extract_item_description(element)
  node = element.at_css('span.lpDescription')
  return nil unless node

  text = node.text.strip
  text.empty? ? nil : text
end
|
|
211
|
+
|
|
212
|
+
# Reads the item's image URL from img.lpItemImage.
#
# The src attribute is preferred, with href as a fallback. Literal
# `&#x2F;` and `&#x3D;` escapes left in the attribute value are decoded to
# `/` and `=`. The previous substitutions replaced each character with
# itself and therefore decoded nothing.
# NOTE(review): presumably these escapes come from double-escaped markup in
# the Lighterpack page — confirm against a fixture.
#
# @param element [#at_css] item node
# @return [String, nil] image URL, or nil when there is no image or no URL attribute
def extract_image_url(element)
  img = element.at_css('img.lpItemImage')
  return nil unless img

  raw = img['src'] || img['href']
  return nil unless raw

  raw.gsub(/&#x2F;/i, '/').gsub(/&#x3D;/i, '=')
end
|
|
229
|
+
|
|
230
|
+
# Tells whether the item is flagged as consumable.
#
# An icon matching i.lpSprite.lpConsumable.lpActive settles it immediately.
# Otherwise the class attribute of the first i.lpSprite.lpConsumable icon is
# searched for the text 'lpActive'.
#
# @param element [#at_css] item node
# @return [Boolean] true when the consumable icon is active
def extract_consumable_flag(element)
  return true if element.at_css('i.lpSprite.lpConsumable.lpActive')

  icon = element.at_css('i.lpSprite.lpConsumable')
  icon ? icon['class'].to_s.include?('lpActive') : false
end
|
|
246
|
+
|
|
247
|
+
# Tells whether the item is flagged as worn.
#
# An icon matching i.lpSprite.lpWorn.lpActive settles it immediately.
# Otherwise the class attribute of the first i.lpSprite.lpWorn icon is
# searched for the text 'lpActive'.
#
# @param element [#at_css] item node
# @return [Boolean] true when the worn icon is active
def extract_worn_flag(element)
  active_icon = element.at_css('i.lpSprite.lpWorn.lpActive')
  return true if active_icon

  icon = element.at_css('i.lpSprite.lpWorn')
  !!icon && icon['class'].to_s.include?('lpActive')
end
|
|
263
|
+
end
|
|
264
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lighterpack_parser/version'
|
|
4
|
+
require_relative 'lighterpack_parser/parser'
|
|
5
|
+
|
|
6
|
+
module LighterpackParser
  # Shortcut that builds a Parser for the given URL and returns its parse result.
  #
  # @param url [String] address of the Lighterpack list
  # @return the value returned by Parser#parse
  def self.parse_url(url)
    parser = Parser.new(url: url)
    parser.parse
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/lighterpack_parser/version'
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |gem|
  # Identity
  gem.name = 'lighterpack-parser'
  gem.version = LighterpackParser::VERSION
  gem.authors = ['Packlista Team']
  gem.email = ['team@packlista.com']

  # Presentation
  gem.summary = 'Parser for Lighterpack lists'
  gem.description = 'Parse Lighterpack HTML to extract list data including categories, items, weights, and metadata'
  gem.homepage = 'https://github.com/alex-ross/lighterpack-parser'
  gem.license = 'MIT'

  # Packaged files: library code, specs (with fixtures), markdown docs and the gemspec itself
  gem.files = Dir['lib/**/*', 'spec/**/*', '*.md', '*.gemspec']
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_dependency 'nokogiri', '~> 1.15'
  gem.add_dependency 'httparty', '~> 0.21'

  # Development-only dependencies
  gem.add_development_dependency 'rspec', '~> 3.12'
end
|