sitedog_parser 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +47 -225
- data/bin/analyze_dictionary +1 -1
- data/bin/sitedog_cli +172 -0
- data/lib/service.rb +7 -2
- data/lib/service_factory.rb +99 -18
- data/lib/sitedog_parser/version.rb +1 -1
- data/lib/sitedog_parser.rb +66 -42
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04e6beaed38d60a1269ed3f20b071f9576eb6c4c40142fbb65fdaa4bd3bac14f
|
4
|
+
data.tar.gz: acf60abb7739f59e4b5732f95286a4e14f386ceaf1ad7d9b971d1fabc3886a87
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7825cb9106d8861d3084ca9eee9f0c52d0dfbfbc409a0a2320cd89e1487a8ed8334fdf1e1d629ab6b65d25da3e514f4dfdf059be88bdd2bad23f6d08e8ca2b63
|
7
|
+
data.tar.gz: ccad1201ba56171fd64a9869dfa538bd24ba6cc44feb0fc52f40c17962a6c582dd1589c375226ae21f289dabe67097c70eb92ad807b161ac6e40ff10b3133a55
|
data/README.md
CHANGED
@@ -4,29 +4,13 @@ A library for parsing and classifying web services from YAML files into structur
|
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
|
-
Add this line to your application's Gemfile:
|
8
|
-
|
9
7
|
```ruby
|
10
8
|
gem 'sitedog_parser'
|
11
9
|
```
|
12
10
|
|
13
|
-
Then execute:
|
14
|
-
|
15
|
-
```bash
|
16
|
-
$ bundle install
|
17
|
-
```
|
18
|
-
|
19
|
-
Or install it yourself:
|
20
|
-
|
21
|
-
```bash
|
22
|
-
$ gem install sitedog_parser
|
23
|
-
```
|
24
|
-
|
25
11
|
## Usage
|
26
12
|
|
27
|
-
###
|
28
|
-
|
29
|
-
The easiest way to use SitedogParser is through its high-level interface:
|
13
|
+
### Basic Usage
|
30
14
|
|
31
15
|
```ruby
|
32
16
|
require 'sitedog_parser'
|
@@ -34,256 +18,94 @@ require 'sitedog_parser'
|
|
34
18
|
# Parse from a YAML file
|
35
19
|
parsed_data = SitedogParser::Parser.parse_file('data.yml')
|
36
20
|
|
37
|
-
# Or parse from a hash (if you already loaded the YAML)
|
38
|
-
yaml_data = YAML.load_file('data.yml', symbolize_names: true)
|
39
|
-
parsed_data = SitedogParser::Parser.parse(yaml_data)
|
40
|
-
|
41
|
-
# Get all services of a specific type across all domains
|
42
|
-
all_hosting_services = SitedogParser::Parser.get_services_by_type(parsed_data, :hosting)
|
43
|
-
all_hosting_services.each do |service|
|
44
|
-
puts "Hosting service: #{service.service}, URL: #{service.url}"
|
45
|
-
end
|
46
|
-
|
47
|
-
# Get all domain names
|
48
|
-
domain_names = SitedogParser::Parser.get_domain_names(parsed_data)
|
49
|
-
puts "Found domains: #{domain_names.join(', ')}"
|
50
|
-
|
51
21
|
# Working with specific domain's services
|
52
22
|
domain_services = parsed_data['example.com']
|
53
|
-
if domain_services[:
|
54
|
-
puts "
|
23
|
+
if domain_services[:hosting]
|
24
|
+
puts "Hosting: #{domain_services[:hosting].first.service}"
|
25
|
+
puts "URL: #{domain_services[:hosting].first.url}"
|
55
26
|
end
|
56
27
|
```
|
57
28
|
|
58
|
-
###
|
29
|
+
### Simple Fields
|
59
30
|
|
60
|
-
You can specify which fields should be treated as simple
|
31
|
+
You can specify which fields should be treated as simple values:
|
61
32
|
|
62
33
|
```ruby
|
63
|
-
# Define
|
64
|
-
simple_fields = [:project, :role, :environment, :registry
|
34
|
+
# Define simple fields
|
35
|
+
simple_fields = [:project, :role, :environment, :registry]
|
65
36
|
|
66
37
|
# Parse with simple fields
|
67
38
|
parsed_data = SitedogParser::Parser.parse(yaml_data, simple_fields: simple_fields)
|
68
39
|
|
69
|
-
# Now you can access these fields directly as strings
|
70
|
-
domain_services = parsed_data['example.com']
|
71
|
-
if domain_services[:project]
|
72
|
-
puts "Project: #{domain_services[:project]}" # This is a string, not a Service object
|
73
|
-
end
|
74
|
-
|
75
40
|
# Find domains with a specific field value
|
76
|
-
|
77
|
-
puts "Production domains: #{domains_with_production.join(', ')}"
|
41
|
+
production_domains = SitedogParser::Parser.get_domains_by_field_value(parsed_data, :environment, 'production')
|
78
42
|
```
|
79
43
|
|
80
|
-
###
|
81
|
-
|
82
|
-
You can use the DictionaryAnalyzer to find services that might be missing from your dictionary:
|
44
|
+
### Export to JSON
|
83
45
|
|
84
46
|
```ruby
|
85
|
-
|
86
|
-
|
47
|
+
# Standard output
|
48
|
+
json_data = SitedogParser::Parser.to_json('services.yml')
|
87
49
|
|
88
|
-
#
|
89
|
-
|
50
|
+
# Or via command line:
|
51
|
+
# $ sitedog_cli services.yml > services.json
|
90
52
|
|
91
|
-
#
|
92
|
-
|
93
|
-
|
94
|
-
# Generate a report
|
95
|
-
report = SitedogParser::DictionaryAnalyzer.report(parsed_data)
|
96
|
-
puts report
|
97
|
-
|
98
|
-
# Or use the provided script
|
99
|
-
# bin/analyze_dictionary data.yml
|
53
|
+
# Compact JSON for inner objects
|
54
|
+
# $ sitedog_cli -C services.yml > services.json
|
100
55
|
```
|
101
56
|
|
102
|
-
|
103
|
-
1. A list of services that are missing from the dictionary
|
104
|
-
2. How many domains use each service
|
105
|
-
3. In which context (service type) each service is used
|
106
|
-
4. A YAML template ready to be added to your dictionary
|
107
|
-
|
108
|
-
### Example: Processing a YAML Configuration
|
109
|
-
|
110
|
-
Input YAML file (`services.yml`):
|
111
|
-
|
112
|
-
```yaml
|
113
|
-
example.com:
|
114
|
-
hosting: https://aws.amazon.com
|
115
|
-
dns:
|
116
|
-
service: cloudflare
|
117
|
-
url: https://cloudflare.com
|
118
|
-
registrar: namecheap
|
119
|
-
ssl: letsencrypt
|
120
|
-
repo: https://github.com/example/repo
|
121
|
-
|
122
|
-
another-site.org:
|
123
|
-
hosting:
|
124
|
-
service: digitalocean
|
125
|
-
url: https://digitalocean.com
|
126
|
-
cdn: https://cloudfront.aws.amazon.com
|
127
|
-
dns: https://domains.google.com
|
128
|
-
```
|
129
|
-
|
130
|
-
Processing this file:
|
131
|
-
|
132
|
-
```ruby
|
133
|
-
require 'sitedog_parser'
|
134
|
-
|
135
|
-
# Parse the file
|
136
|
-
data = SitedogParser::Parser.parse_file('services.yml')
|
137
|
-
|
138
|
-
# Get all domains
|
139
|
-
puts "Domains: #{SitedogParser::Parser.get_domain_names(data).join(', ')}"
|
57
|
+
### JSON Structure Example
|
140
58
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
dns_services = SitedogParser::Parser.get_services_by_type(data, :dns)
|
150
|
-
puts "\nDNS services:"
|
151
|
-
dns_services.each do |service|
|
152
|
-
puts "- #{service.service}: #{service.url}"
|
153
|
-
end
|
154
|
-
|
155
|
-
# Access a specific domain's services
|
156
|
-
puts "\nServices for example.com:"
|
157
|
-
example_services = data['example.com']
|
158
|
-
example_services.each do |type, services|
|
159
|
-
puts "#{type}: #{services.first.service}"
|
160
|
-
end
|
161
|
-
```
|
162
|
-
|
163
|
-
Output:
|
164
|
-
```
|
165
|
-
Domains: example.com, another-site.org
|
166
|
-
|
167
|
-
Hosting services:
|
168
|
-
- Amazon Web Services: https://aws.amazon.com
|
169
|
-
- Digitalocean: https://digitalocean.com
|
170
|
-
|
171
|
-
DNS services:
|
172
|
-
- Cloudflare: https://cloudflare.com
|
173
|
-
- Domains: https://domains.google.com
|
174
|
-
|
175
|
-
Services for example.com:
|
176
|
-
hosting: Amazon Web Services
|
177
|
-
dns: Cloudflare
|
178
|
-
registrar: Namecheap
|
179
|
-
ssl: Letsencrypt
|
180
|
-
repo: Github
|
59
|
+
```json
|
60
|
+
{
|
61
|
+
"example.com": {
|
62
|
+
"hosting": [{"service":"Amazon Web Services","url":"https://aws.amazon.com"}],
|
63
|
+
"dns": [{"service":"Cloudflare","url":"https://cloudflare.com"}],
|
64
|
+
"registrar": [{"service":"Namecheap","url":"https://namecheap.com"}]
|
65
|
+
}
|
66
|
+
}
|
181
67
|
```
|
182
68
|
|
183
|
-
### Service Object
|
184
|
-
|
185
|
-
Each service object has the following structure:
|
69
|
+
### Service Object
|
186
70
|
|
187
71
|
```ruby
|
188
|
-
# Service fields
|
189
72
|
service.service # Name of the service (capitalized string)
|
190
73
|
service.url # URL of the service (string or nil)
|
191
74
|
service.children # Child services (array of Service objects, empty if none)
|
192
75
|
```
|
193
76
|
|
194
|
-
###
|
195
|
-
|
196
|
-
SitedogParser's strength is in normalizing different data formats into a consistent structure. Here are examples showing how various input formats are handled:
|
197
|
-
|
198
|
-
#### 1. Simple URL string
|
199
|
-
```ruby
|
200
|
-
# Input
|
201
|
-
data = "https://github.com/username/repo"
|
77
|
+
### Supported Data Formats
|
202
78
|
|
203
|
-
|
204
|
-
service = ServiceFactory.create(data)
|
205
|
-
service.service # => "Github"
|
206
|
-
service.url # => "https://github.com"
|
207
|
-
service.children # => []
|
208
|
-
```
|
79
|
+
The library handles various data formats:
|
209
80
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
81
|
+
1. **URL strings**: `"https://github.com/username/repo"` → GitHub service
|
82
|
+
2. **Service names**: `"GitHub"` → GitHub service with URL
|
83
|
+
3. **Hashes with service and URL**: `{service: "Github", url: "https://github.com/repo"}`
|
84
|
+
4. **Nested hashes** with service types
|
85
|
+
5. **Hashes with URLs** as values
|
214
86
|
|
215
|
-
|
216
|
-
service = ServiceFactory.create(data)
|
217
|
-
service.service # => "GitHub"
|
218
|
-
service.url # => "https://github.com"
|
219
|
-
service.children # => []
|
220
|
-
```
|
87
|
+
### Dictionary Analysis
|
221
88
|
|
222
|
-
#### 3. Hash with service and URL
|
223
89
|
```ruby
|
224
|
-
#
|
225
|
-
|
226
|
-
service: "Github",
|
227
|
-
url: "https://github.com/username/repo"
|
228
|
-
}
|
90
|
+
# Find candidates for the dictionary (services with name but no URL)
|
91
|
+
candidates = SitedogParser::DictionaryAnalyzer.find_dictionary_candidates(parsed_data)
|
229
92
|
|
230
|
-
#
|
231
|
-
|
232
|
-
service.service # => "Github"
|
233
|
-
service.url # => "https://github.com/username/repo"
|
234
|
-
service.children # => []
|
93
|
+
# Generate a report
|
94
|
+
report = SitedogParser::DictionaryAnalyzer.report(parsed_data)
|
235
95
|
```
|
236
96
|
|
237
|
-
|
238
|
-
```ruby
|
239
|
-
# Input
|
240
|
-
data = {
|
241
|
-
dns: {
|
242
|
-
service: "route53",
|
243
|
-
url: "https://console.aws.amazon.com/route53"
|
244
|
-
},
|
245
|
-
registrar: {
|
246
|
-
service: "namecheap",
|
247
|
-
url: "https://namecheap.com"
|
248
|
-
}
|
249
|
-
}
|
97
|
+
### Command Line Options
|
250
98
|
|
251
|
-
# Output
|
252
|
-
service = ServiceFactory.create(data)
|
253
|
-
service.service # => "Unknown"
|
254
|
-
service.children.size # => 2
|
255
|
-
service.children[0].service # => "Route53"
|
256
|
-
service.children[0].url # => "https://console.aws.amazon.com/route53"
|
257
|
-
service.children[1].service # => "Namecheap"
|
258
|
-
service.children[1].url # => "https://namecheap.com"
|
259
99
|
```
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
}
|
268
|
-
|
269
|
-
# Output
|
270
|
-
service = ServiceFactory.create(data)
|
271
|
-
service.service # => "Unknown"
|
272
|
-
service.children.size # => 2
|
273
|
-
service.children[0].service # => "Hosting"
|
274
|
-
service.children[0].url # => "https://aws.amazon.com"
|
275
|
-
service.children[1].service # => "Cdn"
|
276
|
-
service.children[1].url # => "https://cloudflare.com"
|
100
|
+
$ sitedog_cli --help
|
101
|
+
Usage: sitedog_cli [options] <path_to_yaml_file> [output_file]
|
102
|
+
-d, --debug Enable debug output
|
103
|
+
-c, --compact Compact JSON without formatting
|
104
|
+
-C, --compact-children Formatted JSON with compact inner objects
|
105
|
+
-q, --quiet Suppress non-error messages
|
106
|
+
-h, --help Show this help message
|
277
107
|
```
|
278
108
|
|
279
|
-
## Development and Contribution
|
280
|
-
|
281
|
-
1. Fork the repository
|
282
|
-
2. Create a branch for your changes (`git checkout -b my-new-feature`)
|
283
|
-
3. Commit your changes (`git commit -am 'Add new feature'`)
|
284
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
285
|
-
5. Create a Pull Request
|
286
|
-
|
287
109
|
## License
|
288
110
|
|
289
|
-
|
111
|
+
MIT
|
data/bin/analyze_dictionary
CHANGED
@@ -42,7 +42,7 @@ begin
|
|
42
42
|
end
|
43
43
|
|
44
44
|
# Определяем простые поля, которые не должны рассматриваться как сервисы
|
45
|
-
simple_fields = [:project, :role, :environment, :registry
|
45
|
+
simple_fields = [:project, :role, :environment, :registry]
|
46
46
|
|
47
47
|
# Анализируем данные через наш интерфейс Parser
|
48
48
|
data = SitedogParser::Parser.parse(sites_data, simple_fields: simple_fields, dictionary_path: dictionary_path)
|
data/bin/sitedog_cli
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'sitedog_parser'
|
5
|
+
require 'optparse'
|
6
|
+
require 'logger'
|
7
|
+
|
8
|
+
# Set default options
|
9
|
+
options = {
|
10
|
+
debug: false,
|
11
|
+
pretty: true,
|
12
|
+
compact_children: false,
|
13
|
+
output: nil,
|
14
|
+
log_level: Logger::INFO
|
15
|
+
}
|
16
|
+
|
17
|
+
# Create logger
|
18
|
+
logger = Logger.new(STDOUT)
|
19
|
+
logger.formatter = proc do |severity, datetime, progname, msg|
|
20
|
+
"#{msg}\n"
|
21
|
+
end
|
22
|
+
|
23
|
+
# Command line options parser
|
24
|
+
option_parser = OptionParser.new do |opts|
|
25
|
+
opts.banner = "Usage: sitedog_cli [options] <path_to_yaml_file> [output_file]"
|
26
|
+
|
27
|
+
opts.on("-d", "--debug", "Enable debug output") do
|
28
|
+
options[:debug] = true
|
29
|
+
options[:log_level] = Logger::DEBUG
|
30
|
+
end
|
31
|
+
|
32
|
+
opts.on("-c", "--compact", "Output compact JSON (without pretty formatting)") do
|
33
|
+
options[:pretty] = false
|
34
|
+
end
|
35
|
+
|
36
|
+
opts.on("--compact-children", "-C", "Format JSON with compact inner objects (one line per service)") do
|
37
|
+
options[:compact_children] = true
|
38
|
+
end
|
39
|
+
|
40
|
+
opts.on("-q", "--quiet", "Suppress non-error messages") do
|
41
|
+
options[:log_level] = Logger::ERROR
|
42
|
+
end
|
43
|
+
|
44
|
+
opts.on("-h", "--help", "Show this help message") do
|
45
|
+
puts opts
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# Parse options
|
51
|
+
option_parser.parse!(ARGV)
|
52
|
+
|
53
|
+
# Set logging level
|
54
|
+
logger.level = options[:log_level]
|
55
|
+
|
56
|
+
# Check command line arguments
|
57
|
+
if ARGV.empty? || ARGV.size < 1 || ARGV.size > 2
|
58
|
+
logger.error option_parser.help
|
59
|
+
exit 1
|
60
|
+
end
|
61
|
+
|
62
|
+
file_path = ARGV[0]
|
63
|
+
output_path = ARGV[1]
|
64
|
+
|
65
|
+
# Check if input file exists
|
66
|
+
unless File.exist?(file_path)
|
67
|
+
logger.error "Error: File '#{file_path}' not found."
|
68
|
+
exit 1
|
69
|
+
end
|
70
|
+
|
71
|
+
# Redirect STDERR to hide debug output if not in debug mode
|
72
|
+
unless options[:debug]
|
73
|
+
original_stderr = $stderr.dup
|
74
|
+
$stderr.reopen(File.open(File::NULL, 'w'))
|
75
|
+
end
|
76
|
+
|
77
|
+
# Format JSON with inner objects on a single line
|
78
|
+
def compact_json_generate(data)
|
79
|
+
# Clone data to avoid modifying the original
|
80
|
+
formatted_data = Marshal.load(Marshal.dump(data))
|
81
|
+
|
82
|
+
# Process each domain
|
83
|
+
formatted_data.each do |domain_key, services|
|
84
|
+
# For each service type
|
85
|
+
services.each do |service_type, service_array|
|
86
|
+
if service_array.is_a?(Array)
|
87
|
+
# Convert service array to compact format
|
88
|
+
services[service_type] = service_array.map do |service_obj|
|
89
|
+
# Remove empty children arrays for compactness
|
90
|
+
if service_obj['children'] && service_obj['children'].empty?
|
91
|
+
service_obj.delete('children')
|
92
|
+
end
|
93
|
+
service_obj
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
# Convert to JSON with indentation at top level, but compact inner objects
|
100
|
+
output = "{\n"
|
101
|
+
|
102
|
+
formatted_data.each_with_index do |(domain, services), domain_index|
|
103
|
+
output << " \"#{domain}\": {\n"
|
104
|
+
|
105
|
+
services.keys.sort.each_with_index do |service_type, service_index|
|
106
|
+
service_data = services[service_type]
|
107
|
+
|
108
|
+
# Start line with service type
|
109
|
+
output << " \"#{service_type}\": "
|
110
|
+
|
111
|
+
# Special formatting for service arrays - on a single line
|
112
|
+
if service_data.is_a?(Array)
|
113
|
+
items = service_data.map { |item| JSON.generate(item) }.join(",")
|
114
|
+
output << "[#{items}]"
|
115
|
+
else
|
116
|
+
# For non-arrays use standard JSON.generate
|
117
|
+
output << JSON.generate(service_data)
|
118
|
+
end
|
119
|
+
|
120
|
+
# Add comma for all elements except the last one
|
121
|
+
output << (service_index < services.keys.size - 1 ? ",\n" : "\n")
|
122
|
+
end
|
123
|
+
|
124
|
+
output << " }"
|
125
|
+
output << (domain_index < formatted_data.size - 1 ? ",\n" : "\n")
|
126
|
+
end
|
127
|
+
|
128
|
+
output << "}"
|
129
|
+
output
|
130
|
+
end
|
131
|
+
|
132
|
+
begin
|
133
|
+
logger.debug "Processing file: #{file_path}"
|
134
|
+
|
135
|
+
# Convert YAML to JSON
|
136
|
+
data = SitedogParser::Parser.to_hash(file_path, { logger: logger })
|
137
|
+
logger.debug "Data converted to hash"
|
138
|
+
|
139
|
+
# Convert to JSON based on formatting options
|
140
|
+
json_data = if options[:compact_children]
|
141
|
+
logger.debug "Generating JSON with compact inner objects"
|
142
|
+
compact_json_generate(data)
|
143
|
+
elsif options[:pretty]
|
144
|
+
logger.debug "Generating pretty JSON"
|
145
|
+
JSON.pretty_generate(data)
|
146
|
+
else
|
147
|
+
logger.debug "Generating compact JSON"
|
148
|
+
JSON.generate(data)
|
149
|
+
end
|
150
|
+
|
151
|
+
# If output file is specified, save result to it
|
152
|
+
if output_path
|
153
|
+
logger.debug "Saving to file: #{output_path}"
|
154
|
+
File.write(output_path, json_data)
|
155
|
+
logger.info "JSON data successfully saved to '#{output_path}'."
|
156
|
+
else
|
157
|
+
# Otherwise print JSON to screen
|
158
|
+
# Use puts directly for JSON output to avoid logger prefixes
|
159
|
+
puts json_data
|
160
|
+
end
|
161
|
+
|
162
|
+
rescue => e
|
163
|
+
# Restore STDERR for error messages
|
164
|
+
$stderr.reopen(original_stderr) unless options[:debug]
|
165
|
+
|
166
|
+
logger.error "Error processing file: #{e.message}"
|
167
|
+
logger.debug e.backtrace.join("\n") if options[:debug]
|
168
|
+
exit 1
|
169
|
+
ensure
|
170
|
+
# Restore STDERR
|
171
|
+
$stderr.reopen(original_stderr) unless options[:debug]
|
172
|
+
end
|
data/lib/service.rb
CHANGED
@@ -1,11 +1,16 @@
|
|
1
|
-
class Service < Data.define(:service, :url, :children)
|
2
|
-
def initialize(service:, url: nil, children: [])
|
1
|
+
class Service < Data.define(:service, :url, :children, :image_url)
|
2
|
+
def initialize(service:, url: nil, children: [], image_url: nil)
|
3
3
|
raise ArgumentError, "Service cannot be empty" if service.nil? || service.empty?
|
4
4
|
|
5
5
|
service => String
|
6
6
|
url => String if url
|
7
7
|
children => Array if children
|
8
|
+
image_url => String if image_url
|
8
9
|
|
9
10
|
super
|
10
11
|
end
|
12
|
+
|
13
|
+
def has_children?
|
14
|
+
!children.empty?
|
15
|
+
end
|
11
16
|
end
|
data/lib/service_factory.rb
CHANGED
@@ -2,6 +2,7 @@ require 'pry'
|
|
2
2
|
require_relative 'url_checker'
|
3
3
|
require_relative 'dictionary'
|
4
4
|
require_relative 'service'
|
5
|
+
require 'logger'
|
5
6
|
|
6
7
|
# Factory for creating Service objects from different data formats
|
7
8
|
class ServiceFactory
|
@@ -10,19 +11,26 @@ class ServiceFactory
|
|
10
11
|
# @param data [String, Hash, Array] data for creating service
|
11
12
|
# @param service_type [Symbol] service type (used as fallback)
|
12
13
|
# @param dictionary_path [String, nil] path to the dictionary file (optional)
|
14
|
+
# @param options [Hash] дополнительные опции
|
15
|
+
# @option options [Logger] :logger логгер для вывода сообщений
|
13
16
|
# @return [Service] created service object
|
14
|
-
def self.create(data, service_type = nil, dictionary_path = nil)
|
17
|
+
def self.create(data, service_type = nil, dictionary_path = nil, options = {})
|
18
|
+
# Получаем логгер из опций или создаем пустой логгер, пишущий в nil
|
19
|
+
logger = options[:logger] || Logger.new(nil)
|
20
|
+
|
15
21
|
# Check for nil
|
16
22
|
return nil if data.nil?
|
17
23
|
|
18
24
|
slug = nil
|
19
25
|
url = nil
|
20
26
|
dictionary = Dictionary.new(dictionary_path)
|
27
|
+
dict_entry = nil
|
21
28
|
|
22
29
|
case data
|
23
30
|
in String if UrlChecker.url_like?(data) # url
|
24
31
|
url = UrlChecker.normalize_url(data)
|
25
|
-
|
32
|
+
dict_entry = dictionary.match(url)
|
33
|
+
slug = dict_entry&.dig('name')
|
26
34
|
|
27
35
|
# If not found in dictionary and service_type exists, use it
|
28
36
|
if slug.nil? && service_type
|
@@ -32,17 +40,26 @@ class ServiceFactory
|
|
32
40
|
slug = UrlChecker.extract_name(url) if slug.nil?
|
33
41
|
end
|
34
42
|
|
35
|
-
|
43
|
+
logger.debug "url: #{slug} <- #{url}"
|
36
44
|
in String if !UrlChecker.url_like?(data) # slug
|
37
45
|
slug = data
|
38
|
-
|
39
|
-
|
46
|
+
dict_entry = dictionary.lookup(slug)
|
47
|
+
|
48
|
+
# Если нашли запись в словаре, используем её имя
|
49
|
+
if dict_entry && dict_entry['name']
|
50
|
+
slug = dict_entry['name']
|
51
|
+
end
|
52
|
+
|
53
|
+
url = dict_entry&.dig('url')
|
54
|
+
logger.debug "slug: #{slug} -> #{url}"
|
40
55
|
in { service: String => service_slug, url: String => service_url }
|
41
56
|
slug = service_slug.to_s.capitalize
|
42
57
|
url = service_url
|
43
|
-
|
58
|
+
# Поиск в словаре после получения slug
|
59
|
+
dict_entry = dictionary.lookup(slug)
|
60
|
+
logger.debug "hash: #{slug} + #{url}"
|
44
61
|
in Hash
|
45
|
-
|
62
|
+
logger.debug "hash: #{data}"
|
46
63
|
|
47
64
|
# Protection from nil values in key fields
|
48
65
|
if (data.key?(:service) || data.key?("service")) &&
|
@@ -52,12 +69,35 @@ class ServiceFactory
|
|
52
69
|
|
53
70
|
# 1. Check if hash contains only URL-like strings (list of services)
|
54
71
|
if data.values.all? { |v| v.is_a?(String) && UrlChecker.url_like?(v) }
|
55
|
-
|
72
|
+
logger.debug "hash with services: #{data.keys.join(', ')}"
|
56
73
|
# Create array of child services
|
57
74
|
children = []
|
58
75
|
data.each do |key, url_value|
|
59
76
|
service_name = key.to_s
|
60
|
-
|
77
|
+
# Первый приоритет - поиск в словаре по URL
|
78
|
+
child_dict_entry = dictionary.match(url_value)
|
79
|
+
|
80
|
+
if child_dict_entry && child_dict_entry['name']
|
81
|
+
# Если нашли запись в словаре по URL, используем её имя вместо ключа
|
82
|
+
service_name = child_dict_entry['name']
|
83
|
+
else
|
84
|
+
# Если записи в словаре нет по URL, ищем по имени
|
85
|
+
key_dict_entry = dictionary.lookup(service_name)
|
86
|
+
if key_dict_entry && key_dict_entry['name']
|
87
|
+
service_name = key_dict_entry['name']
|
88
|
+
else
|
89
|
+
# Если не нашли в словаре ни по URL, ни по имени, капитализируем исходное имя
|
90
|
+
service_name = service_name.capitalize
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
child_image_url = child_dict_entry&.dig('image_url')
|
95
|
+
|
96
|
+
child_service = Service.new(
|
97
|
+
service: service_name,
|
98
|
+
url: url_value,
|
99
|
+
image_url: child_image_url
|
100
|
+
)
|
61
101
|
children << child_service
|
62
102
|
end
|
63
103
|
|
@@ -79,7 +119,15 @@ class ServiceFactory
|
|
79
119
|
url_key = data.key?(:url) ? :url : "url"
|
80
120
|
url_value = data[url_key]
|
81
121
|
|
82
|
-
|
122
|
+
# Ищем в словаре
|
123
|
+
service_dict_entry = dictionary.lookup(service_name) || dictionary.match(url_value)
|
124
|
+
service_image_url = service_dict_entry&.dig('image_url')
|
125
|
+
|
126
|
+
return Service.new(
|
127
|
+
service: service_name.capitalize,
|
128
|
+
url: url_value,
|
129
|
+
image_url: service_image_url
|
130
|
+
)
|
83
131
|
end
|
84
132
|
|
85
133
|
# 3. Process nested hashes
|
@@ -98,13 +146,28 @@ class ServiceFactory
|
|
98
146
|
url_key = value.key?(:url) ? :url : "url"
|
99
147
|
url_value = value[url_key]
|
100
148
|
|
101
|
-
|
149
|
+
# Ищем в словаре
|
150
|
+
nested_dict_entry = dictionary.lookup(service_name) || dictionary.match(url_value)
|
151
|
+
nested_image_url = nested_dict_entry&.dig('image_url')
|
152
|
+
|
153
|
+
child = Service.new(
|
154
|
+
service: service_name.capitalize,
|
155
|
+
url: url_value,
|
156
|
+
image_url: nested_image_url
|
157
|
+
)
|
102
158
|
# 3.2 If value has hash with only URL-like values
|
103
159
|
elsif value.values.all? { |v| v.is_a?(String) && UrlChecker.url_like?(v) }
|
104
160
|
child_children = []
|
105
161
|
|
106
162
|
value.each do |sub_key, url_value|
|
107
|
-
|
163
|
+
sub_dict_entry = dictionary.lookup(sub_key.to_s) || dictionary.match(url_value)
|
164
|
+
sub_image_url = sub_dict_entry&.dig('image_url')
|
165
|
+
|
166
|
+
child_children << Service.new(
|
167
|
+
service: sub_key.to_s.capitalize,
|
168
|
+
url: url_value,
|
169
|
+
image_url: sub_image_url
|
170
|
+
)
|
108
171
|
end
|
109
172
|
|
110
173
|
child = Service.new(service: key.to_s, children: child_children)
|
@@ -120,7 +183,14 @@ class ServiceFactory
|
|
120
183
|
value.each do |sub_key, sub_value|
|
121
184
|
if sub_value.is_a?(String) && UrlChecker.url_like?(sub_value)
|
122
185
|
has_urls = true
|
123
|
-
|
186
|
+
sub_dict_entry = dictionary.lookup(sub_key.to_s) || dictionary.match(sub_value)
|
187
|
+
sub_image_url = sub_dict_entry&.dig('image_url')
|
188
|
+
|
189
|
+
child_children << Service.new(
|
190
|
+
service: sub_key.to_s.capitalize,
|
191
|
+
url: sub_value,
|
192
|
+
image_url: sub_image_url
|
193
|
+
)
|
124
194
|
end
|
125
195
|
end
|
126
196
|
|
@@ -129,7 +199,14 @@ class ServiceFactory
|
|
129
199
|
end
|
130
200
|
# 3.4 If the value is a URL string
|
131
201
|
elsif value.is_a?(String) && UrlChecker.url_like?(value)
|
132
|
-
|
202
|
+
url_dict_entry = dictionary.match(value)
|
203
|
+
url_image_url = url_dict_entry&.dig('image_url')
|
204
|
+
|
205
|
+
child = Service.new(
|
206
|
+
service: key.to_s.capitalize,
|
207
|
+
url: value,
|
208
|
+
image_url: url_image_url
|
209
|
+
)
|
133
210
|
end
|
134
211
|
|
135
212
|
children << child if child
|
@@ -146,7 +223,7 @@ class ServiceFactory
|
|
146
223
|
return Service.new(service: "Unknown", children: children)
|
147
224
|
end
|
148
225
|
in Array
|
149
|
-
|
226
|
+
logger.debug "array: #{data}"
|
150
227
|
|
151
228
|
# Create services from array elements
|
152
229
|
children = data.map { |item| create(item, service_type, dictionary_path) }.compact
|
@@ -169,13 +246,17 @@ class ServiceFactory
|
|
169
246
|
|
170
247
|
# Create service with collected data
|
171
248
|
if slug
|
172
|
-
|
249
|
+
# Получаем URL изображения из записи словаря, если она есть
|
250
|
+
image_url = dict_entry&.dig('image_url')
|
251
|
+
|
252
|
+
# Создаем сервис со всеми данными сразу
|
253
|
+
Service.new(service: slug, url: url, image_url: image_url)
|
173
254
|
else
|
174
255
|
nil
|
175
256
|
end
|
176
257
|
rescue => e
|
177
|
-
|
178
|
-
|
258
|
+
logger.error "Error creating service: #{e.message}"
|
259
|
+
logger.error "Data: #{data.inspect}"
|
179
260
|
return nil
|
180
261
|
end
|
181
262
|
end
|
data/lib/sitedog_parser.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require "sitedog_parser/version"
|
2
2
|
require 'yaml'
|
3
|
+
require 'date'
|
4
|
+
require 'json'
|
3
5
|
|
4
6
|
require_relative "service"
|
5
7
|
require_relative "dictionary"
|
@@ -20,10 +22,12 @@ module SitedogParser
|
|
20
22
|
# @param symbolize_names [Boolean] whether to symbolize keys in the YAML file
|
21
23
|
# @param simple_fields [Array<Symbol>] fields that should remain as simple strings without service wrapping
|
22
24
|
# @param dictionary_path [String, nil] path to the dictionary file (optional)
|
25
|
+
# @param options [Hash] дополнительные опции
|
26
|
+
# @option options [Logger] :logger логгер для вывода сообщений
|
23
27
|
# @return [Hash] hash containing parsed services by type and domain
|
24
|
-
def self.parse_file(file_path, symbolize_names: true, simple_fields: DEFAULT_SIMPLE_FIELDS, dictionary_path: nil)
|
28
|
+
def self.parse_file(file_path, symbolize_names: true, simple_fields: DEFAULT_SIMPLE_FIELDS, dictionary_path: nil, options: {})
|
25
29
|
yaml = YAML.load_file(file_path, symbolize_names: symbolize_names)
|
26
|
-
parse(yaml, simple_fields: simple_fields, dictionary_path: dictionary_path)
|
30
|
+
parse(yaml, simple_fields: simple_fields, dictionary_path: dictionary_path, options: options)
|
27
31
|
end
|
28
32
|
|
29
33
|
# Parse YAML data and convert it to structured Ruby objects
|
@@ -31,22 +35,38 @@ module SitedogParser
|
|
31
35
|
# @param yaml [Hash] YAML data as a hash
|
32
36
|
# @param simple_fields [Array<Symbol>] fields that should remain as simple strings without service wrapping
|
33
37
|
# @param dictionary_path [String, nil] path to the dictionary file (optional)
|
38
|
+
# @param options [Hash] дополнительные опции
|
39
|
+
# @option options [Logger] :logger логгер для вывода сообщений
|
34
40
|
# @return [Hash] hash containing parsed services by type and domain
|
35
|
-
def self.parse(yaml, simple_fields: DEFAULT_SIMPLE_FIELDS, dictionary_path: nil)
|
41
|
+
def self.parse(yaml, simple_fields: DEFAULT_SIMPLE_FIELDS, dictionary_path: nil, options: {})
|
36
42
|
result = {}
|
43
|
+
logger = options[:logger]
|
37
44
|
|
38
45
|
yaml.each do |domain_name, items|
|
39
46
|
services = {}
|
40
47
|
|
41
48
|
# Process each service type and its data
|
42
49
|
items.each do |service_type, data|
|
43
|
-
# Проверяем, является ли это поле "простым"
|
44
|
-
if simple_fields.include?(service_type)
|
45
|
-
#
|
46
|
-
|
50
|
+
# Проверяем, является ли это поле "простым", имеет суффикс _at, или данные - экземпляр DateTime
|
51
|
+
if simple_fields.include?(service_type) || service_type.to_s.end_with?('_at') || data.is_a?(DateTime)
|
52
|
+
# Если данные уже DateTime, сохраняем как есть
|
53
|
+
if data.is_a?(DateTime)
|
54
|
+
services[service_type] = data
|
55
|
+
# Для полей _at пробуем преобразовать строку в DateTime
|
56
|
+
elsif service_type.to_s.end_with?('_at') && data.is_a?(String)
|
57
|
+
begin
|
58
|
+
services[service_type] = DateTime.parse(data)
|
59
|
+
rescue Date::Error
|
60
|
+
# Если не удалось преобразовать, оставляем как строку
|
61
|
+
services[service_type] = data
|
62
|
+
end
|
63
|
+
else
|
64
|
+
# Для обычных простых полей просто сохраняем значение
|
65
|
+
services[service_type] = data
|
66
|
+
end
|
47
67
|
else
|
48
68
|
# Для обычных полей создаем сервис
|
49
|
-
service = ServiceFactory.create(data, service_type, dictionary_path)
|
69
|
+
service = ServiceFactory.create(data, service_type, dictionary_path, options)
|
50
70
|
|
51
71
|
if service
|
52
72
|
services[service_type] ||= []
|
@@ -62,47 +82,51 @@ module SitedogParser
|
|
62
82
|
result
|
63
83
|
end
|
64
84
|
|
65
|
-
#
|
66
|
-
#
|
67
|
-
# @param
|
68
|
-
# @
|
69
|
-
# @return [
|
70
|
-
def self.
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
85
|
+
# Преобразует YAML файл в хеш, где объекты Service преобразуются в хеши
|
86
|
+
# @param file_path [String] путь к YAML файлу
|
87
|
+
# @param options [Hash] дополнительные опции
|
88
|
+
# @option options [Logger] :logger логгер для вывода сообщений
|
89
|
+
# @return [Hash] хеш с сервисами
|
90
|
+
def self.to_hash(file_path, options = {})
|
91
|
+
data = parse_file(file_path, options: options)
|
92
|
+
|
93
|
+
# Преобразуем объекты Service в хеши
|
94
|
+
result = {}
|
95
|
+
|
96
|
+
data.each do |domain, services|
|
97
|
+
domain_key = domain.to_sym # Преобразуем ключи доменов в символы
|
98
|
+
result[domain_key] = {}
|
99
|
+
|
100
|
+
services.each do |service_type, service_data|
|
101
|
+
service_type_key = service_type.to_sym # Преобразуем ключи типов сервисов в символы
|
102
|
+
|
103
|
+
if service_data.is_a?(Array) && service_data.first.is_a?(Service)
|
104
|
+
# Преобразуем массив сервисов в массив хешей
|
105
|
+
result[domain_key][service_type_key] = service_data.map do |service|
|
106
|
+
{
|
107
|
+
'service' => service.service,
|
108
|
+
'url' => service.url,
|
109
|
+
'children' => service.children.map { |child| {'service' => child.service, 'url' => child.url} }
|
110
|
+
}
|
111
|
+
end
|
112
|
+
else
|
113
|
+
# Сохраняем простые поля как есть
|
114
|
+
result[domain_key][service_type_key] = service_data
|
115
|
+
end
|
76
116
|
end
|
77
117
|
end
|
78
118
|
|
79
119
|
result
|
80
120
|
end
|
81
121
|
|
82
|
-
#
|
122
|
+
# Преобразует данные из YAML файла в JSON формат
|
83
123
|
#
|
84
|
-
# @param
|
85
|
-
# @
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
# Get domains with a specific simple field value
|
91
|
-
#
|
92
|
-
# @param parsed_data [Hash] data returned by parse or parse_file
|
93
|
-
# @param field [Symbol] simple field to filter by
|
94
|
-
# @param value [String] value to match
|
95
|
-
# @return [Array] array of domain names that have the specified field value
|
96
|
-
def self.get_domains_by_field_value(parsed_data, field, value)
|
97
|
-
result = []
|
98
|
-
|
99
|
-
parsed_data.each do |domain_name, services|
|
100
|
-
if services[field] == value
|
101
|
-
result << domain_name
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
result
|
124
|
+
# @param file_path [String] путь к YAML файлу
|
125
|
+
# @param options [Hash] дополнительные опции
|
126
|
+
# @option options [Logger] :logger логгер для вывода сообщений
|
127
|
+
# @return [String] форматированная JSON строка
|
128
|
+
def self.to_json(file_path, options = {})
|
129
|
+
JSON.pretty_generate(to_hash(file_path, options))
|
106
130
|
end
|
107
131
|
end
|
108
132
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitedog_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Nemytchenko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-05-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.10.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: minitest-power_assert
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: thor
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -99,13 +113,14 @@ description: A library for parsing and classifying web services, hosting, and do
|
|
99
113
|
email:
|
100
114
|
- nemytchenko@gmail.com
|
101
115
|
executables:
|
102
|
-
-
|
116
|
+
- sitedog_cli
|
103
117
|
extensions: []
|
104
118
|
extra_rdoc_files: []
|
105
119
|
files:
|
106
120
|
- CHANGELOG.md
|
107
121
|
- README.md
|
108
122
|
- bin/analyze_dictionary
|
123
|
+
- bin/sitedog_cli
|
109
124
|
- lib/data_structures.rb
|
110
125
|
- lib/dictionary.rb
|
111
126
|
- lib/dictionary_analyzer.rb
|