sitedog_parser 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitedog_cli +17 -1
- data/lib/service.rb +4 -2
- data/lib/service_factory.rb +71 -11
- data/lib/sitedog_parser/version.rb +1 -1
- data/lib/sitedog_parser.rb +48 -3
- data/lib/url_checker.rb +6 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 753510fc5e4c0a2d565f29d0e5b894f653f83c23cb3354d27a631a74852ea421
|
4
|
+
data.tar.gz: 3a2ec09307309a917f4530a937b7443b061c2e671bc5e84a627e487ee7536a3c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dcb05fa986c51a3a69da4bb812042777e6c878d778215af6466d58aad90f643691e64399a98e17d403e79b5eba607287ed6129323626d69c0bcd8136056893d2
|
7
|
+
data.tar.gz: cf4e7360906081835272daeaa168c927d9d2a46787d4328afe22a1db03e82de9167a07280d6e41cdf81eeb26ff2de644494185a07a1a26124a523016ee821b4d
|
data/bin/sitedog_cli
CHANGED
@@ -4,6 +4,7 @@ require 'bundler/setup'
|
|
4
4
|
require 'sitedog_parser'
|
5
5
|
require 'optparse'
|
6
6
|
require 'logger'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Set default options
|
9
10
|
options = {
|
@@ -132,10 +133,25 @@ end
|
|
132
133
|
begin
|
133
134
|
logger.debug "Processing file: #{file_path}"
|
134
135
|
|
135
|
-
#
|
136
|
+
# Load YAML to check raw data
|
137
|
+
raw_yaml = YAML.load_file(file_path)
|
138
|
+
if options[:debug]
|
139
|
+
logger.debug "Raw YAML data for debug:"
|
140
|
+
logger.debug raw_yaml.inspect
|
141
|
+
logger.debug ""
|
142
|
+
end
|
143
|
+
|
144
|
+
# Convert YAML to hash
|
136
145
|
data = SitedogParser::Parser.to_hash(file_path, { logger: logger })
|
137
146
|
logger.debug "Data converted to hash"
|
138
147
|
|
148
|
+
# Debug the parsed data
|
149
|
+
if options[:debug]
|
150
|
+
logger.debug "Parsed data structure:"
|
151
|
+
logger.debug data.inspect
|
152
|
+
logger.debug ""
|
153
|
+
end
|
154
|
+
|
139
155
|
# Convert to JSON based on formatting options
|
140
156
|
json_data = if options[:compact_children]
|
141
157
|
logger.debug "Generating JSON with compact inner objects"
|
data/lib/service.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
-
class Service < Data.define(:service, :url, :children, :image_url)
|
2
|
-
def initialize(service:, url: nil, children: [], image_url: nil)
|
1
|
+
class Service < Data.define(:service, :url, :children, :image_url, :properties, :value)
|
2
|
+
def initialize(service:, url: nil, children: [], image_url: nil, properties: {}, value: nil)
|
3
3
|
raise ArgumentError, "Service cannot be empty" if service.nil? || service.empty?
|
4
4
|
|
5
5
|
service => String
|
6
6
|
url => String if url
|
7
7
|
children => Array if children
|
8
8
|
image_url => String if image_url
|
9
|
+
properties => Hash if properties
|
10
|
+
# value может быть любого типа, поэтому не проверяем
|
9
11
|
|
10
12
|
super
|
11
13
|
end
|
data/lib/service_factory.rb
CHANGED
@@ -61,6 +61,10 @@ class ServiceFactory
|
|
61
61
|
in Hash
|
62
62
|
logger.debug "hash: #{data}"
|
63
63
|
|
64
|
+
# Check if all values are URL-like strings
|
65
|
+
all_url_like = data.values.all? { |v| v.is_a?(String) && UrlChecker.url_like?(v) }
|
66
|
+
logger.debug "All values are URL-like: #{all_url_like}, values: #{data.values.map { |v| "#{v.class}: #{v}" }.join(', ')}"
|
67
|
+
|
64
68
|
# Protection from nil values in key fields
|
65
69
|
if (data.key?(:service) || data.key?("service")) &&
|
66
70
|
(data[:service].nil? || data["service"].nil?)
|
@@ -77,6 +81,8 @@ class ServiceFactory
|
|
77
81
|
# Первый приоритет - поиск в словаре по URL
|
78
82
|
child_dict_entry = dictionary.match(url_value)
|
79
83
|
|
84
|
+
logger.debug "Child for #{key}: service_name=#{service_name}, url=#{url_value}, dict_entry=#{child_dict_entry}"
|
85
|
+
|
80
86
|
if child_dict_entry && child_dict_entry['name']
|
81
87
|
# Если нашли запись в словаре по URL, используем её имя вместо ключа
|
82
88
|
service_name = child_dict_entry['name']
|
@@ -103,10 +109,45 @@ class ServiceFactory
|
|
103
109
|
|
104
110
|
# Create parent service with child elements
|
105
111
|
if service_type && children.any?
|
112
|
+
logger.debug "Returning service for #{service_type} with #{children.size} children"
|
106
113
|
return Service.new(service: service_type.to_s, children: children)
|
107
114
|
elsif children.size == 1
|
108
|
-
# If only one service and no service_type, return it
|
115
|
+
# If only one service and no service_type, return it
|
116
|
+
logger.debug "Returning single child service (no service_type)"
|
109
117
|
return children.first
|
118
|
+
else
|
119
|
+
logger.debug "Not returning a service for #{data.inspect}, service_type=#{service_type}, children.size=#{children.size}"
|
120
|
+
end
|
121
|
+
# 1.5 Check if hash contains at least some URL-like strings
|
122
|
+
elsif data.values.any? { |v| v.is_a?(String) && UrlChecker.url_like?(v) }
|
123
|
+
logger.debug "hash with some URL-like values: #{data.inspect}"
|
124
|
+
|
125
|
+
# Debug: Check each value for URL-like
|
126
|
+
data.each do |k, v|
|
127
|
+
if v.is_a?(String)
|
128
|
+
logger.debug " Checking #{k}: #{v} - URL-like? #{UrlChecker.url_like?(v)}"
|
129
|
+
else
|
130
|
+
logger.debug " Skipping non-string #{k}: #{v.class}"
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# Сохраняем все значения в properties, сохраняя порядок
|
135
|
+
properties = {}
|
136
|
+
data.each do |key, value|
|
137
|
+
properties[key.to_s] = value
|
138
|
+
logger.debug "Added property for #{key}: #{value}"
|
139
|
+
end
|
140
|
+
|
141
|
+
# Create service with properties only
|
142
|
+
if !properties.empty?
|
143
|
+
service = Service.new(
|
144
|
+
service: service_type.to_s,
|
145
|
+
url: nil,
|
146
|
+
properties: properties,
|
147
|
+
children: [] # Пустой массив children
|
148
|
+
)
|
149
|
+
logger.debug "Returning service with #{properties.size} properties"
|
150
|
+
return service
|
110
151
|
end
|
111
152
|
end
|
112
153
|
|
@@ -225,19 +266,38 @@ class ServiceFactory
|
|
225
266
|
in Array
|
226
267
|
logger.debug "array: #{data}"
|
227
268
|
|
228
|
-
# Create services from array elements
|
229
|
-
children =
|
269
|
+
# Create services from all array elements for children
|
270
|
+
children = []
|
271
|
+
data.each_with_index do |item, index|
|
272
|
+
# Для URL-подобных строк используем стандартный механизм
|
273
|
+
if item.is_a?(String) && UrlChecker.url_like?(item)
|
274
|
+
child_service = create(item, service_type, dictionary_path, options)
|
275
|
+
children << child_service if child_service
|
276
|
+
else
|
277
|
+
# Для простых значений создаем сервис с value
|
278
|
+
child_service = Service.new(
|
279
|
+
service: service_type ? service_type.to_s : "value",
|
280
|
+
url: nil,
|
281
|
+
properties: {},
|
282
|
+
value: item # Используем поле value
|
283
|
+
)
|
284
|
+
children << child_service
|
285
|
+
logger.debug "Created service with value for item #{index}: #{item.inspect}"
|
286
|
+
end
|
287
|
+
end
|
230
288
|
|
231
|
-
#
|
232
|
-
if
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
289
|
+
# Return service with all items as children
|
290
|
+
if service_type
|
291
|
+
result = Service.new(
|
292
|
+
service: service_type.to_s,
|
293
|
+
url: nil,
|
294
|
+
children: children
|
295
|
+
)
|
296
|
+
logger.debug "Returning array service with #{children.size} children"
|
297
|
+
return result
|
237
298
|
end
|
238
299
|
|
239
|
-
#
|
240
|
-
# return nil
|
300
|
+
# Fallback to nil if no service_type
|
241
301
|
return nil
|
242
302
|
else
|
243
303
|
# Handle values that don't match any pattern
|
data/lib/sitedog_parser.rb
CHANGED
@@ -68,9 +68,16 @@ module SitedogParser
|
|
68
68
|
# Для обычных полей создаем сервис
|
69
69
|
service = ServiceFactory.create(data, service_type, dictionary_path, options)
|
70
70
|
|
71
|
+
# Debug output
|
72
|
+
if logger
|
73
|
+
logger.debug "ServiceFactory.create for #{service_type}: #{service.inspect}"
|
74
|
+
end
|
75
|
+
|
71
76
|
if service
|
72
77
|
services[service_type] ||= []
|
73
78
|
services[service_type] << service
|
79
|
+
elsif logger
|
80
|
+
logger.debug "Service for #{service_type} is nil, field will be skipped"
|
74
81
|
end
|
75
82
|
end
|
76
83
|
end
|
@@ -103,11 +110,49 @@ module SitedogParser
|
|
103
110
|
if service_data.is_a?(Array) && service_data.first.is_a?(Service)
|
104
111
|
# Преобразуем массив сервисов в массив хешей
|
105
112
|
result[domain_key][service_type_key] = service_data.map do |service|
|
106
|
-
{
|
113
|
+
service_hash = {
|
107
114
|
'service' => service.service,
|
108
|
-
'url' => service.url
|
109
|
-
'children' => service.children.map { |child| {'service' => child.service, 'url' => child.url} }
|
115
|
+
'url' => service.url
|
110
116
|
}
|
117
|
+
|
118
|
+
# Добавляем image_url если он есть
|
119
|
+
if service.image_url
|
120
|
+
service_hash['image_url'] = service.image_url
|
121
|
+
end
|
122
|
+
|
123
|
+
# Добавляем children только если они есть
|
124
|
+
if service.children && !service.children.empty?
|
125
|
+
service_hash['children'] = service.children.map do |child|
|
126
|
+
child_hash = {
|
127
|
+
'service' => child.service,
|
128
|
+
'url' => child.url
|
129
|
+
}
|
130
|
+
|
131
|
+
# Добавляем image_url для детей если он есть
|
132
|
+
if child.image_url
|
133
|
+
child_hash['image_url'] = child.image_url
|
134
|
+
end
|
135
|
+
|
136
|
+
# Добавляем properties для children если они есть
|
137
|
+
if child.properties && !child.properties.empty?
|
138
|
+
child_hash['properties'] = child.properties
|
139
|
+
end
|
140
|
+
|
141
|
+
# Добавляем value для children если оно есть
|
142
|
+
if child.value
|
143
|
+
child_hash['value'] = child.value
|
144
|
+
end
|
145
|
+
|
146
|
+
child_hash
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
# Добавляем properties, если они есть
|
151
|
+
if service.properties && !service.properties.empty?
|
152
|
+
service_hash['properties'] = service.properties
|
153
|
+
end
|
154
|
+
|
155
|
+
service_hash
|
111
156
|
end
|
112
157
|
else
|
113
158
|
# Сохраняем простые поля как есть
|
data/lib/url_checker.rb
CHANGED
@@ -28,7 +28,7 @@ module UrlChecker
|
|
28
28
|
end
|
29
29
|
|
30
30
|
# Check for standard URLs
|
31
|
-
pattern = /^((?:https?|ftp|sftp|ftps|ssh|git|ws|wss):\/\/)?[a-zA-Z0-9][-a-zA-Z0-9.]+\.[a-zA-Z]{2,}(:[0-9]+)?(\/[-a-zA-Z0-9%_.~#+]*)*(\?[-a-zA-Z0-9%_&=.~#+]*)?(#[-a-zA-Z0-9%_&=.~#+\/]*)?$/
|
31
|
+
pattern = /^((?:https?|ftp|sftp|ftps|ssh|git|ws|wss):\/\/)?((?:[a-zA-Z0-9][-a-zA-Z0-9.]+\.[a-zA-Z]{2,})|(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(:[0-9]+)?(\/[-a-zA-Z0-9%_.~#+]*)*(\?[-a-zA-Z0-9%_&=.~#+]*)?(#[-a-zA-Z0-9%_&=.~#+\/]*)?$/
|
32
32
|
|
33
33
|
!!string.match(pattern)
|
34
34
|
end
|
@@ -61,6 +61,11 @@ module UrlChecker
|
|
61
61
|
# Remove protocol and www prefix if present
|
62
62
|
domain = url.gsub(%r{^(?:https?://)?(?:www\.)?}, "")
|
63
63
|
|
64
|
+
# Check if it's an IP address
|
65
|
+
if domain.match?(/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/)
|
66
|
+
return "IP Address"
|
67
|
+
end
|
68
|
+
|
64
69
|
# Extract domain from URL by removing everything after first / or : or ? or #
|
65
70
|
domain = domain.split(/[:\/?#]/).first
|
66
71
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitedog_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Nemytchenko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-05-
|
11
|
+
date: 2025-05-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|