ecosystems-bibliothecary 14.2.0 → 14.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Gemfile +1 -0
- data/README.md +1 -1
- data/bin/benchmark +386 -0
- data/lib/bibliothecary/analyser/analysis.rb +10 -5
- data/lib/bibliothecary/analyser/matchers.rb +7 -5
- data/lib/bibliothecary/analyser.rb +1 -0
- data/lib/bibliothecary/file_info.rb +7 -0
- data/lib/bibliothecary/parsers/nuget.rb +1 -1
- data/lib/bibliothecary/runner.rb +170 -10
- data/lib/bibliothecary/version.rb +1 -1
- data/lib/bibliothecary.rb +3 -6
- data/lib/dockerfile_parser.rb +1 -1
- data/lib/modelfile_parser.rb +8 -8
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c529c2cc8f2f35098f5bbca1294b7a36249580d6ef44a735b00257422dfe568c
|
|
4
|
+
data.tar.gz: 2680b1d61e665b2bef8ef5b987f3b58b29559266549e3cd68ccdaa8a65d2c4b0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7518c86a4817297d41c9bbab194901de59a0ca6f07dcad90a9b7d7a78584042d08f9765fc21080475767d399fc6e59f274f8e0132f6673d5e6f7dc2e7bc24908
|
|
7
|
+
data.tar.gz: e916a1d6f2c4fba1f908c8abc0717013ca21d870f7ddea1c0257c579c3de619037e6cf01ab0eb6f57be2c8191482655267b0765020d19cf8d5da7597dbf56ba8
|
data/CHANGELOG.md
CHANGED
|
@@ -13,6 +13,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
13
13
|
|
|
14
14
|
### Removed
|
|
15
15
|
|
|
16
|
+
## [14.3.0]
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- Added `bin/benchmark` script for performance testing.
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
|
|
24
|
+
- Fixed bug where Runner was recreated on every Bibliothecary method call, causing repeated index rebuilding.
|
|
25
|
+
- Memoized package_managers array in Runner.
|
|
26
|
+
- Added filename/extension index for O(1) parser lookup instead of O(n) linear scan through all parsers.
|
|
27
|
+
- Optimized `identify_manifests` to use filename index directly (~139x faster).
|
|
28
|
+
- Optimized `analyse_file` to use filename index for candidate filtering (~16x faster).
|
|
29
|
+
- Added per-file caching of mapping details in FileInfo to avoid repeated lookups.
|
|
30
|
+
- Added `parse_file_info` method to reuse FileInfo objects during parsing.
|
|
31
|
+
|
|
16
32
|
## [14.2.0]
|
|
17
33
|
|
|
18
34
|
### Added
|
data/Gemfile
CHANGED
data/README.md
CHANGED
|
@@ -13,7 +13,7 @@ Requires Ruby 3.4 or above.
|
|
|
13
13
|
Add this line to your application's Gemfile:
|
|
14
14
|
|
|
15
15
|
```ruby
|
|
16
|
-
gem "bibliothecary", git: "https://github.com/ecosyste-ms/bibliothecary.git"
|
|
16
|
+
gem "ecosystems-bibliothecary", git: "https://github.com/ecosyste-ms/bibliothecary.git", require: "bibliothecary"
|
|
17
17
|
```
|
|
18
18
|
|
|
19
19
|
And then execute:
|
data/bin/benchmark
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "bundler/setup"
|
|
5
|
+
require "bibliothecary"
|
|
6
|
+
require "benchmark"
|
|
7
|
+
require "optparse"
|
|
8
|
+
|
|
9
|
+
class InfrastructureBenchmark
|
|
10
|
+
FIXTURES_DIR = File.expand_path("../spec/fixtures", __dir__)
|
|
11
|
+
|
|
12
|
+
def initialize(iterations: 100)
|
|
13
|
+
@iterations = iterations
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def run
|
|
17
|
+
puts "Infrastructure Benchmark"
|
|
18
|
+
puts "=" * 60
|
|
19
|
+
puts "Iterations: #{@iterations}"
|
|
20
|
+
puts
|
|
21
|
+
|
|
22
|
+
# Prepare test data - mix of different file types
|
|
23
|
+
test_files = prepare_test_files
|
|
24
|
+
puts "Test files: #{test_files.length}"
|
|
25
|
+
puts
|
|
26
|
+
|
|
27
|
+
benchmark_package_managers
|
|
28
|
+
benchmark_matching_overhead
|
|
29
|
+
benchmark_load_file_info_list_from_contents(test_files)
|
|
30
|
+
benchmark_analyse_file(test_files)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def prepare_test_files
|
|
34
|
+
files = []
|
|
35
|
+
# Get a representative sample of fixtures
|
|
36
|
+
%w[package.json Gemfile.lock Cargo.toml pom.xml requirements.txt go.mod].each do |name|
|
|
37
|
+
path = Dir.glob("#{FIXTURES_DIR}/**/#{name}").first
|
|
38
|
+
next unless path
|
|
39
|
+
|
|
40
|
+
files << { file_path: path.sub("#{FIXTURES_DIR}/", ""), contents: File.read(path) }
|
|
41
|
+
end
|
|
42
|
+
files
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def benchmark_package_managers
|
|
46
|
+
puts "package_managers method:"
|
|
47
|
+
puts "-" * 40
|
|
48
|
+
|
|
49
|
+
runner = Bibliothecary.runner
|
|
50
|
+
|
|
51
|
+
# Warm up
|
|
52
|
+
5.times { runner.package_managers }
|
|
53
|
+
|
|
54
|
+
time = Benchmark.measure do
|
|
55
|
+
@iterations.times { runner.package_managers }
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
printf " %d calls: %.3f ms total, %.4f ms/call\n",
|
|
59
|
+
@iterations, time.real * 1000, (time.real / @iterations) * 1000
|
|
60
|
+
puts
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def benchmark_load_file_info_list_from_contents(test_files)
|
|
64
|
+
puts "load_file_info_list_from_contents:"
|
|
65
|
+
puts "-" * 40
|
|
66
|
+
|
|
67
|
+
runner = Bibliothecary.runner
|
|
68
|
+
|
|
69
|
+
# Warm up
|
|
70
|
+
5.times { runner.load_file_info_list_from_contents(test_files) }
|
|
71
|
+
|
|
72
|
+
time = Benchmark.measure do
|
|
73
|
+
@iterations.times { runner.load_file_info_list_from_contents(test_files) }
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
total_files = test_files.length * @iterations
|
|
77
|
+
printf " %d calls (%d files each): %.3f ms total\n",
|
|
78
|
+
@iterations, test_files.length, time.real * 1000
|
|
79
|
+
printf " %.4f ms/call, %.4f ms/file\n",
|
|
80
|
+
(time.real / @iterations) * 1000,
|
|
81
|
+
(time.real / total_files) * 1000
|
|
82
|
+
puts
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def benchmark_analyse_file(test_files)
|
|
86
|
+
puts "analyse_file (full pipeline):"
|
|
87
|
+
puts "-" * 40
|
|
88
|
+
|
|
89
|
+
runner = Bibliothecary.runner
|
|
90
|
+
|
|
91
|
+
test_files.each do |file|
|
|
92
|
+
# Warm up
|
|
93
|
+
3.times { runner.analyse_file(file[:file_path], file[:contents]) }
|
|
94
|
+
|
|
95
|
+
time = Benchmark.measure do
|
|
96
|
+
@iterations.times { runner.analyse_file(file[:file_path], file[:contents]) }
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
printf " %-30s %.4f ms/call\n",
|
|
100
|
+
File.basename(file[:file_path]),
|
|
101
|
+
(time.real / @iterations) * 1000
|
|
102
|
+
end
|
|
103
|
+
puts
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def benchmark_matching_overhead
|
|
107
|
+
puts "Matching overhead breakdown:"
|
|
108
|
+
puts "-" * 40
|
|
109
|
+
|
|
110
|
+
runner = Bibliothecary.runner
|
|
111
|
+
pms = runner.package_managers
|
|
112
|
+
|
|
113
|
+
# Test with a simple package.json
|
|
114
|
+
test_file = { file_path: "package.json", contents: '{"dependencies":{}}' }
|
|
115
|
+
info = Bibliothecary::FileInfo.new(nil, test_file[:file_path], test_file[:contents])
|
|
116
|
+
|
|
117
|
+
# Benchmark match_info? across all parsers
|
|
118
|
+
time = Benchmark.measure do
|
|
119
|
+
@iterations.times do
|
|
120
|
+
pms.each { |pm| pm.match_info?(info) }
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
printf " match_info? x %d parsers: %.4f ms/file\n",
|
|
124
|
+
pms.length, (time.real / @iterations) * 1000
|
|
125
|
+
|
|
126
|
+
# Benchmark just the npm parser's match_info?
|
|
127
|
+
npm = pms.find { |pm| pm.platform_name == "npm" }
|
|
128
|
+
time = Benchmark.measure do
|
|
129
|
+
@iterations.times { npm.match_info?(info) }
|
|
130
|
+
end
|
|
131
|
+
printf " npm.match_info? alone: %.4f ms/call\n",
|
|
132
|
+
(time.real / @iterations) * 1000
|
|
133
|
+
|
|
134
|
+
# Benchmark first_matching_mapping_details (called multiple times per file)
|
|
135
|
+
time = Benchmark.measure do
|
|
136
|
+
@iterations.times do
|
|
137
|
+
npm.send(:first_matching_mapping_details, info)
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
printf " first_matching_mapping_details: %.4f ms/call\n",
|
|
141
|
+
(time.real / @iterations) * 1000
|
|
142
|
+
|
|
143
|
+
puts
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
class ParserBenchmark
|
|
148
|
+
FIXTURES_DIR = File.expand_path("../spec/fixtures", __dir__)
|
|
149
|
+
|
|
150
|
+
# Parser methods that require remote services
|
|
151
|
+
REMOTE_PARSERS = {
|
|
152
|
+
"swiftpm" => [:parse_package_swift],
|
|
153
|
+
"hackage" => [:parse_cabal],
|
|
154
|
+
"hex" => [:parse_mix, :parse_mix_lock],
|
|
155
|
+
"carthage" => [:parse_cartfile, :parse_cartfile_private, :parse_cartfile_resolved],
|
|
156
|
+
"clojars" => [:parse_manifest],
|
|
157
|
+
}.freeze
|
|
158
|
+
|
|
159
|
+
# Multi-parser methods shared across many package managers
|
|
160
|
+
MULTI_PARSER_METHODS = %i[
|
|
161
|
+
parse_cyclonedx_json
|
|
162
|
+
parse_cyclonedx_xml
|
|
163
|
+
parse_spdx_json
|
|
164
|
+
parse_spdx_tag_value
|
|
165
|
+
parse_dependencies_csv
|
|
166
|
+
].freeze
|
|
167
|
+
|
|
168
|
+
def initialize(options = {})
|
|
169
|
+
@iterations = options.fetch(:iterations, 100)
|
|
170
|
+
@parser_filter = options[:parser]
|
|
171
|
+
@verbose = options[:verbose]
|
|
172
|
+
@native_only = options[:native_only]
|
|
173
|
+
@results = {}
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
def run
|
|
177
|
+
puts "Bibliothecary Parser Benchmark"
|
|
178
|
+
puts "=" * 60
|
|
179
|
+
puts "Iterations per file: #{@iterations}"
|
|
180
|
+
puts "Fixtures directory: #{FIXTURES_DIR}"
|
|
181
|
+
puts
|
|
182
|
+
|
|
183
|
+
parsers = filtered_parsers
|
|
184
|
+
puts "Running benchmarks for #{parsers.length} parser(s)..."
|
|
185
|
+
puts
|
|
186
|
+
|
|
187
|
+
parsers.each do |parser|
|
|
188
|
+
benchmark_parser(parser)
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
print_summary
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def filtered_parsers
|
|
195
|
+
all_parsers = Bibliothecary.package_managers
|
|
196
|
+
return all_parsers unless @parser_filter
|
|
197
|
+
|
|
198
|
+
matching = all_parsers.select do |pm|
|
|
199
|
+
pm.platform_name.downcase.include?(@parser_filter.downcase)
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
if matching.empty?
|
|
203
|
+
puts "No parser matching '#{@parser_filter}' found."
|
|
204
|
+
puts "Available parsers: #{all_parsers.map(&:platform_name).join(', ')}"
|
|
205
|
+
exit 1
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
matching
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
def benchmark_parser(parser)
|
|
212
|
+
platform = parser.platform_name
|
|
213
|
+
mapping = parser.mapping
|
|
214
|
+
fixtures = find_fixtures_for_parser(mapping, platform)
|
|
215
|
+
|
|
216
|
+
if fixtures.empty?
|
|
217
|
+
puts "#{platform}: no matching fixtures found"
|
|
218
|
+
puts if @verbose
|
|
219
|
+
return
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
puts "#{platform} (#{fixtures.length} files)"
|
|
223
|
+
puts "-" * 40
|
|
224
|
+
|
|
225
|
+
parser_total = 0
|
|
226
|
+
file_results = []
|
|
227
|
+
|
|
228
|
+
fixtures.each do |fixture_path, mapping_entry|
|
|
229
|
+
contents = File.read(fixture_path)
|
|
230
|
+
filename = File.basename(fixture_path)
|
|
231
|
+
relative = fixture_path.sub("#{FIXTURES_DIR}/", "")
|
|
232
|
+
|
|
233
|
+
begin
|
|
234
|
+
time = Benchmark.measure do
|
|
235
|
+
@iterations.times do
|
|
236
|
+
parser.send(mapping_entry[:parser], contents, options: { filename: filename })
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
avg_ms = (time.real / @iterations) * 1000
|
|
241
|
+
parser_total += time.real
|
|
242
|
+
|
|
243
|
+
file_results << {
|
|
244
|
+
file: relative,
|
|
245
|
+
total: time.real,
|
|
246
|
+
avg_ms: avg_ms,
|
|
247
|
+
kind: mapping_entry[:kind],
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
if @verbose
|
|
251
|
+
printf " %-40s %8.3f ms/call (%s)\n", relative, avg_ms, mapping_entry[:kind]
|
|
252
|
+
end
|
|
253
|
+
rescue Bibliothecary::RemoteParsingError => e
|
|
254
|
+
puts " #{relative}: skipped (remote parser unavailable)"
|
|
255
|
+
rescue => e
|
|
256
|
+
puts " #{relative}: error - #{e.class}: #{e.message}"
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
unless @verbose
|
|
261
|
+
file_results.sort_by { |r| -r[:avg_ms] }.first(3).each do |r|
|
|
262
|
+
printf " %-40s %8.3f ms/call\n", r[:file], r[:avg_ms]
|
|
263
|
+
end
|
|
264
|
+
puts " ..." if file_results.length > 3
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
avg_total = (parser_total / fixtures.length / @iterations) * 1000
|
|
268
|
+
printf " Total: %.3f ms avg per file\n", avg_total
|
|
269
|
+
puts
|
|
270
|
+
|
|
271
|
+
@results[platform] = {
|
|
272
|
+
files: fixtures.length,
|
|
273
|
+
total_time: parser_total,
|
|
274
|
+
avg_per_file: avg_total,
|
|
275
|
+
file_results: file_results,
|
|
276
|
+
}
|
|
277
|
+
end
|
|
278
|
+
|
|
279
|
+
def find_fixtures_for_parser(mapping, platform_name)
|
|
280
|
+
fixtures = []
|
|
281
|
+
remote_methods = REMOTE_PARSERS[platform_name] || []
|
|
282
|
+
|
|
283
|
+
all_fixtures.each do |fixture_path|
|
|
284
|
+
filename = File.basename(fixture_path)
|
|
285
|
+
relative_path = fixture_path.sub("#{FIXTURES_DIR}/", "")
|
|
286
|
+
|
|
287
|
+
mapping.each do |matcher, entry|
|
|
288
|
+
next unless entry[:parser]
|
|
289
|
+
next if remote_methods.include?(entry[:parser])
|
|
290
|
+
next if @native_only && MULTI_PARSER_METHODS.include?(entry[:parser])
|
|
291
|
+
|
|
292
|
+
if matcher_matches?(matcher, filename, fixture_path)
|
|
293
|
+
fixtures << [fixture_path, entry]
|
|
294
|
+
break
|
|
295
|
+
end
|
|
296
|
+
end
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
fixtures
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
def matcher_matches?(matcher, filename, full_path)
|
|
303
|
+
relative_path = full_path.sub("#{FIXTURES_DIR}/", "")
|
|
304
|
+
|
|
305
|
+
case matcher
|
|
306
|
+
when Regexp
|
|
307
|
+
filename.match?(matcher)
|
|
308
|
+
when String
|
|
309
|
+
filename == matcher
|
|
310
|
+
when Proc
|
|
311
|
+
matcher.call(relative_path)
|
|
312
|
+
else
|
|
313
|
+
false
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
def all_fixtures
|
|
318
|
+
@all_fixtures ||= Dir.glob("#{FIXTURES_DIR}/**/*")
|
|
319
|
+
.select { |f| File.file?(f) }
|
|
320
|
+
.reject { |f| f.include?("/broken/") }
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
def print_summary
|
|
324
|
+
return if @results.empty?
|
|
325
|
+
|
|
326
|
+
puts "=" * 60
|
|
327
|
+
puts "Summary (sorted by avg time per file)"
|
|
328
|
+
puts "=" * 60
|
|
329
|
+
|
|
330
|
+
sorted = @results.sort_by { |_, v| -v[:avg_per_file] }
|
|
331
|
+
|
|
332
|
+
printf "%-20s %10s %12s\n", "Parser", "Files", "Avg ms/file"
|
|
333
|
+
printf "%-20s %10s %12s\n", "-" * 20, "-" * 10, "-" * 12
|
|
334
|
+
|
|
335
|
+
sorted.each do |platform, data|
|
|
336
|
+
printf "%-20s %10d %12.3f\n", platform, data[:files], data[:avg_per_file]
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
puts
|
|
340
|
+
total_files = @results.values.sum { |v| v[:files] }
|
|
341
|
+
total_time = @results.values.sum { |v| v[:total_time] }
|
|
342
|
+
puts "Total: #{total_files} files, #{(total_time * 1000).round(1)} ms total time"
|
|
343
|
+
end
|
|
344
|
+
end
|
|
345
|
+
|
|
346
|
+
options = {
|
|
347
|
+
iterations: 100,
|
|
348
|
+
verbose: false,
|
|
349
|
+
mode: :parsers,
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
OptionParser.new do |opts|
|
|
353
|
+
opts.banner = "Usage: bin/benchmark [options]"
|
|
354
|
+
|
|
355
|
+
opts.on("-p", "--parser NAME", "Only benchmark parsers matching NAME") do |p|
|
|
356
|
+
options[:parser] = p
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
opts.on("-n", "--iterations N", Integer, "Number of iterations per file (default: 100)") do |n|
|
|
360
|
+
options[:iterations] = n
|
|
361
|
+
end
|
|
362
|
+
|
|
363
|
+
opts.on("-v", "--verbose", "Show all files, not just slowest") do
|
|
364
|
+
options[:verbose] = true
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
opts.on("--native-only", "Exclude shared multi-parsers (CycloneDX, SPDX, CSV)") do
|
|
368
|
+
options[:native_only] = true
|
|
369
|
+
end
|
|
370
|
+
|
|
371
|
+
opts.on("--infra", "Benchmark infrastructure (load_file_info_list, etc)") do
|
|
372
|
+
options[:mode] = :infra
|
|
373
|
+
end
|
|
374
|
+
|
|
375
|
+
opts.on("-h", "--help", "Show this help") do
|
|
376
|
+
puts opts
|
|
377
|
+
exit
|
|
378
|
+
end
|
|
379
|
+
end.parse!
|
|
380
|
+
|
|
381
|
+
case options[:mode]
|
|
382
|
+
when :infra
|
|
383
|
+
InfrastructureBenchmark.new(iterations: options[:iterations]).run
|
|
384
|
+
else
|
|
385
|
+
ParserBenchmark.new(options).run
|
|
386
|
+
end
|
data/lib/bibliothecary/analyser/analysis.rb
CHANGED
|
@@ -40,7 +40,7 @@ module Bibliothecary
|
|
|
40
40
|
# If your Parser needs to return multiple responses for one file, please override this method
|
|
41
41
|
# For example see conda.rb
|
|
42
42
|
kind = determine_kind_from_info(info)
|
|
43
|
-
parser_result =
|
|
43
|
+
parser_result = parse_file_info(info, options: options)
|
|
44
44
|
parser_result = ParserResult.new(dependencies: []) if parser_result.nil? # work around any legacy parsers that return nil
|
|
45
45
|
|
|
46
46
|
Bibliothecary::Analyser.create_analysis(platform_name, info.relative_path, kind, parser_result)
|
|
@@ -52,26 +52,31 @@ module Bibliothecary
|
|
|
52
52
|
# Call the matching parse class method for this file with
|
|
53
53
|
# these contents
|
|
54
54
|
def parse_file(filename, contents, options: {})
|
|
55
|
-
|
|
55
|
+
parse_file_info(FileInfo.new(nil, filename, contents), options: options)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Parse a file using its FileInfo object, reusing cached mapping details.
|
|
59
|
+
def parse_file_info(info, options: {})
|
|
60
|
+
details = first_matching_mapping_details(info)
|
|
56
61
|
|
|
57
62
|
# this can be raised if we don't check match?/match_info?,
|
|
58
63
|
# OR don't have the file contents when we check them, so
|
|
59
64
|
# it turns out for example that a .xml file isn't a
|
|
60
65
|
# manifest after all.
|
|
61
|
-
raise Bibliothecary::FileParsingError.new("No parser for this file type",
|
|
66
|
+
raise Bibliothecary::FileParsingError.new("No parser for this file type", info.relative_path) unless details[:parser]
|
|
62
67
|
|
|
63
68
|
# The `parser` method should raise an exception if the file is malformed,
|
|
64
69
|
# should return empty [] if the file is fine but simply doesn't contain
|
|
65
70
|
# any dependencies, and should never return nil. At the time of writing
|
|
66
71
|
# this comment, some of the parsers return [] or nil to mean an error
|
|
67
72
|
# which is confusing to users.
|
|
68
|
-
send(details[:parser], contents, options: options.merge(filename:
|
|
73
|
+
send(details[:parser], info.contents, options: options.merge(filename: info.relative_path))
|
|
69
74
|
rescue Exception => e # default is StandardError but C bindings throw Exceptions # rubocop:disable Lint/RescueException
|
|
70
75
|
# the C xml parser also puts a newline at the end of the message
|
|
71
76
|
location = e.backtrace_locations[0]
|
|
72
77
|
.to_s
|
|
73
78
|
.then { |l| l =~ /bibliothecary\// ? l.split("bibliothecary/").last : l.split("gems/").last }
|
|
74
|
-
raise Bibliothecary::FileParsingError.new(e.message.strip,
|
|
79
|
+
raise Bibliothecary::FileParsingError.new(e.message.strip, info.relative_path, location)
|
|
75
80
|
end
|
|
76
81
|
|
|
77
82
|
private
|
data/lib/bibliothecary/analyser/matchers.rb
CHANGED
|
@@ -52,12 +52,14 @@ module Bibliothecary
|
|
|
52
52
|
first_matching_mapping_details(info).any?
|
|
53
53
|
end
|
|
54
54
|
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
# Get mapping details for this file, using cache if available.
|
|
56
|
+
# The cache is stored on the FileInfo object to avoid repeated lookups.
|
|
57
57
|
def first_matching_mapping_details(info)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
info.cached_mapping_details(self) do
|
|
59
|
+
mapping
|
|
60
|
+
.find { |matcher, details| mapping_entry_match?(matcher, details, info) }
|
|
61
|
+
&.last || {}
|
|
62
|
+
end
|
|
61
63
|
end
|
|
62
64
|
end
|
|
63
65
|
end
|
data/lib/bibliothecary/file_info.rb
CHANGED
|
@@ -46,10 +46,17 @@ module Bibliothecary
|
|
|
46
46
|
@contents = contents
|
|
47
47
|
|
|
48
48
|
@package_manager = nil
|
|
49
|
+
@mapping_cache = {}
|
|
49
50
|
end
|
|
50
51
|
|
|
51
52
|
def groupable?
|
|
52
53
|
@package_manager&.groupable?(self)
|
|
53
54
|
end
|
|
55
|
+
|
|
56
|
+
# Cache and retrieve mapping details for a given package manager class.
|
|
57
|
+
# This avoids repeatedly calling first_matching_mapping_details.
|
|
58
|
+
def cached_mapping_details(package_manager_class)
|
|
59
|
+
@mapping_cache[package_manager_class] ||= yield
|
|
60
|
+
end
|
|
54
61
|
end
|
|
55
62
|
end
|
data/lib/bibliothecary/parsers/nuget.rb
CHANGED
|
@@ -201,7 +201,7 @@ module Bibliothecary
|
|
|
201
201
|
|
|
202
202
|
def self.parse_paket_lock(file_contents, options: {})
|
|
203
203
|
lines = file_contents.split("\n")
|
|
204
|
-
package_version_re = /\s+(?<name>\S+)\s\((?<version>\d
|
|
204
|
+
package_version_re = /\s+(?<name>\S+)\s\((?<version>\d+(?:\.\d+)+)\)/
|
|
205
205
|
packages = lines.select { |line| package_version_re.match(line) }.map { |line| package_version_re.match(line) }.map do |match|
|
|
206
206
|
Dependency.new(
|
|
207
207
|
name: match[:name].strip,
|
data/lib/bibliothecary/runner.rb
CHANGED
|
@@ -40,12 +40,160 @@ module Bibliothecary
|
|
|
40
40
|
end
|
|
41
41
|
|
|
42
42
|
def applicable_package_managers(info)
|
|
43
|
-
|
|
43
|
+
candidates = candidate_package_managers(info.relative_path)
|
|
44
|
+
managers = candidates.select { |pm| pm.match_info?(info) }
|
|
44
45
|
managers.empty? ? [nil] : managers
|
|
45
46
|
end
|
|
46
47
|
|
|
47
48
|
def package_managers
|
|
48
|
-
|
|
49
|
+
@package_managers ||= Bibliothecary::Parsers.constants
|
|
50
|
+
.map { |c| Bibliothecary::Parsers.const_get(c) }
|
|
51
|
+
.sort_by { |c| c.to_s.downcase }
|
|
52
|
+
.freeze
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Get candidate package managers for a file path using filename/extension index.
|
|
56
|
+
# Falls back to all package managers for unindexed patterns.
|
|
57
|
+
def candidate_package_managers(path)
|
|
58
|
+
filename = File.basename(path)
|
|
59
|
+
filename_lower = filename.downcase
|
|
60
|
+
|
|
61
|
+
# Check exact filename match first (use fetch to avoid default block on frozen hash)
|
|
62
|
+
candidates = filename_index.fetch(filename_lower, nil)
|
|
63
|
+
return candidates if candidates
|
|
64
|
+
|
|
65
|
+
# Check extension matches
|
|
66
|
+
extension_index.each do |ext, ext_candidates|
|
|
67
|
+
return ext_candidates if filename_lower.end_with?(ext)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Fall back to all package managers for unindexed patterns
|
|
71
|
+
package_managers
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Build an index mapping lowercase filenames to candidate parsers
|
|
75
|
+
def filename_index
|
|
76
|
+
@filename_index ||= build_filename_index
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Build an index mapping lowercase extensions to candidate parsers
|
|
80
|
+
def extension_index
|
|
81
|
+
@extension_index ||= build_extension_index
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def build_filename_index
|
|
85
|
+
index = {}
|
|
86
|
+
|
|
87
|
+
package_managers.each do |pm|
|
|
88
|
+
pm.mapping.each_key do |matcher|
|
|
89
|
+
next unless matcher.is_a?(Proc)
|
|
90
|
+
|
|
91
|
+
# Extract filenames from the matcher by testing common patterns
|
|
92
|
+
extract_filenames_from_matcher(matcher).each do |filename|
|
|
93
|
+
key = filename.downcase
|
|
94
|
+
index[key] ||= []
|
|
95
|
+
index[key] << pm
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Deduplicate and freeze
|
|
101
|
+
index.transform_values! { |v| v.uniq.freeze }
|
|
102
|
+
index.freeze
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def build_extension_index
|
|
106
|
+
index = {}
|
|
107
|
+
|
|
108
|
+
package_managers.each do |pm|
|
|
109
|
+
pm.mapping.each_key do |matcher|
|
|
110
|
+
next unless matcher.is_a?(Proc)
|
|
111
|
+
|
|
112
|
+
# Extract extensions from the matcher
|
|
113
|
+
extract_extensions_from_matcher(matcher).each do |ext|
|
|
114
|
+
key = ext.downcase
|
|
115
|
+
index[key] ||= []
|
|
116
|
+
index[key] << pm
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Deduplicate and freeze
|
|
122
|
+
index.transform_values! { |v| v.uniq.freeze }
|
|
123
|
+
index.freeze
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Try to extract filename patterns from a matcher proc
|
|
127
|
+
def extract_filenames_from_matcher(matcher)
|
|
128
|
+
filenames = []
|
|
129
|
+
|
|
130
|
+
# Test common manifest filenames to see which ones match
|
|
131
|
+
common_filenames.each do |filename|
|
|
132
|
+
filenames << filename if matcher.call(filename)
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
filenames
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# Try to extract extension patterns from a matcher proc
|
|
139
|
+
def extract_extensions_from_matcher(matcher)
|
|
140
|
+
extensions = []
|
|
141
|
+
|
|
142
|
+
# Test common extensions
|
|
143
|
+
common_extensions.each do |ext|
|
|
144
|
+
test_file = "test#{ext}"
|
|
145
|
+
extensions << ext if matcher.call(test_file)
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
extensions
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def common_filenames
|
|
152
|
+
@common_filenames ||= %w[
|
|
153
|
+
package.json package-lock.json yarn.lock pnpm-lock.yaml npm-shrinkwrap.json npm-ls.json bun.lock
|
|
154
|
+
Gemfile Gemfile.lock gems.rb gems.locked
|
|
155
|
+
Cargo.toml Cargo.lock
|
|
156
|
+
go.mod go.sum Gopkg.toml Gopkg.lock glide.yaml glide.lock Godeps
|
|
157
|
+
requirements.txt Pipfile Pipfile.lock pyproject.toml poetry.lock setup.py
|
|
158
|
+
pom.xml build.gradle build.gradle.kts ivy.xml
|
|
159
|
+
composer.json composer.lock
|
|
160
|
+
Podfile Podfile.lock
|
|
161
|
+
pubspec.yaml pubspec.lock
|
|
162
|
+
Package.swift Package.resolved
|
|
163
|
+
Cartfile Cartfile.resolved Cartfile.private
|
|
164
|
+
mix.exs mix.lock
|
|
165
|
+
project.clj
|
|
166
|
+
shard.yml shard.lock
|
|
167
|
+
environment.yml environment.yaml
|
|
168
|
+
bower.json
|
|
169
|
+
elm-package.json elm.json
|
|
170
|
+
vcpkg.json
|
|
171
|
+
dub.json dub.sdl
|
|
172
|
+
haxelib.json
|
|
173
|
+
action.yml action.yaml
|
|
174
|
+
Brewfile Brewfile.lock.json
|
|
175
|
+
REQUIRE Project.toml Manifest.toml
|
|
176
|
+
paket.lock packages.config Project.json Project.lock.json packages.lock.json project.assets.json
|
|
177
|
+
DESCRIPTION
|
|
178
|
+
META.json META.yml cpanfile
|
|
179
|
+
cabal.config
|
|
180
|
+
cyclonedx.json cyclonedx.xml
|
|
181
|
+
dependencies.csv
|
|
182
|
+
docker-compose.yml docker-compose.yaml Dockerfile
|
|
183
|
+
MLmodel
|
|
184
|
+
Modelfile
|
|
185
|
+
dvc.yaml
|
|
186
|
+
cog.yaml
|
|
187
|
+
bentofile.yaml
|
|
188
|
+
uv.lock pylock.toml
|
|
189
|
+
].freeze
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
def common_extensions
|
|
193
|
+
@common_extensions ||= %w[
|
|
194
|
+
.gemspec .nuspec .csproj .cabal .podspec .podspec.json
|
|
195
|
+
.spdx .cdx.json .cdx.xml
|
|
196
|
+
].freeze
|
|
49
197
|
end
|
|
50
198
|
|
|
51
199
|
# Parses an array of format [{file_path: "", contents: ""},] to match
|
|
@@ -120,7 +268,9 @@ module Bibliothecary
|
|
|
120
268
|
def analyse_file(file_path, contents)
|
|
121
269
|
contents = Bibliothecary.utf8_string(contents)
|
|
122
270
|
|
|
123
|
-
|
|
271
|
+
# Use filename index to quickly find candidate parsers
|
|
272
|
+
candidates = candidate_package_managers(file_path)
|
|
273
|
+
candidates.select { |pm| pm.match?(file_path, contents) }.map do |pm|
|
|
124
274
|
pm.analyse_contents(file_path, contents, options: @options)
|
|
125
275
|
end.flatten.uniq.compact
|
|
126
276
|
end
|
|
@@ -137,14 +287,24 @@ module Bibliothecary
|
|
|
137
287
|
ignored_dirs.include?(f) || f.start_with?(*ignored_dirs_with_slash)
|
|
138
288
|
end
|
|
139
289
|
allowed_file_list = allowed_file_list.reject { |f| ignored_files.include?(f) }
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
290
|
+
|
|
291
|
+
# Fast path: use filename index directly for known manifest filenames
|
|
292
|
+
# This avoids creating FileInfo objects and calling match? for each file
|
|
293
|
+
manifests = []
|
|
294
|
+
allowed_file_list.each do |file_path|
|
|
295
|
+
filename_lower = File.basename(file_path).downcase
|
|
296
|
+
|
|
297
|
+
# Check if this filename is in our index (known manifest)
|
|
298
|
+
if filename_index.key?(filename_lower)
|
|
299
|
+
manifests << file_path
|
|
300
|
+
next
|
|
146
301
|
end
|
|
147
|
-
|
|
302
|
+
|
|
303
|
+
# Check extension index
|
|
304
|
+
matched = extension_index.keys.any? { |ext| filename_lower.end_with?(ext) }
|
|
305
|
+
manifests << file_path if matched
|
|
306
|
+
end
|
|
307
|
+
manifests.sort
|
|
148
308
|
end
|
|
149
309
|
|
|
150
310
|
def ignored_dirs
|
data/lib/bibliothecary.rb
CHANGED
|
@@ -100,19 +100,16 @@ module Bibliothecary
|
|
|
100
100
|
end
|
|
101
101
|
|
|
102
102
|
def self.runner
|
|
103
|
-
configuration
|
|
104
|
-
@runner
|
|
103
|
+
@runner ||= Runner.new(configuration)
|
|
105
104
|
end
|
|
106
105
|
|
|
107
106
|
def self.configuration
|
|
108
107
|
@configuration ||= Configuration.new
|
|
109
|
-
@runner = Runner.new(@configuration)
|
|
110
|
-
@configuration
|
|
111
108
|
end
|
|
112
109
|
|
|
113
110
|
def self.reset
|
|
114
|
-
@configuration =
|
|
115
|
-
@runner =
|
|
111
|
+
@configuration = nil
|
|
112
|
+
@runner = nil
|
|
116
113
|
end
|
|
117
114
|
|
|
118
115
|
def self.configure
|
data/lib/dockerfile_parser.rb
CHANGED
|
@@ -4,7 +4,7 @@ class DockerfileParser
|
|
|
4
4
|
end
|
|
5
5
|
|
|
6
6
|
def parse
|
|
7
|
-
fromlines = @file_contents.split("\n").select { |line| line.strip =~
|
|
7
|
+
fromlines = @file_contents.split("\n").select { |line| line.strip =~ /^FROM/i }
|
|
8
8
|
|
|
9
9
|
fromlines.map do |line|
|
|
10
10
|
line = line.strip.split(' ')
|
data/lib/modelfile_parser.rb
CHANGED
|
@@ -4,10 +4,10 @@ class ModelfileParser
|
|
|
4
4
|
end
|
|
5
5
|
|
|
6
6
|
def parse
|
|
7
|
-
fromlines = @file_contents.split("\n").select { |line| line.strip =~
|
|
7
|
+
fromlines = @file_contents.split("\n").select { |line| line.strip =~ /^FROM/i }
|
|
8
8
|
|
|
9
9
|
fromlines.map do |line|
|
|
10
|
-
line = line.strip.split
|
|
10
|
+
line = line.strip.split
|
|
11
11
|
|
|
12
12
|
# Remove the FROM keyword
|
|
13
13
|
line.shift
|
|
@@ -23,19 +23,19 @@ class ModelfileParser
|
|
|
23
23
|
model_ref = line[0]
|
|
24
24
|
|
|
25
25
|
# Check if it's a file path (local GGUF or directory)
|
|
26
|
-
if model_ref =~ /\.(gguf|safetensors)$/i || model_ref.start_with?(
|
|
26
|
+
if model_ref =~ /\.(gguf|safetensors)$/i || model_ref.start_with?("./", "/")
|
|
27
27
|
{
|
|
28
28
|
name: File.basename(model_ref),
|
|
29
|
-
requirement:
|
|
30
|
-
type:
|
|
29
|
+
requirement: "local",
|
|
30
|
+
type: "runtime",
|
|
31
31
|
}
|
|
32
32
|
else
|
|
33
33
|
# It's a registry model (e.g., llama3.2 or llama3.2:latest)
|
|
34
|
-
parts = model_ref.split(
|
|
34
|
+
parts = model_ref.split(":")
|
|
35
35
|
{
|
|
36
36
|
name: parts[0],
|
|
37
|
-
requirement: parts[1] ||
|
|
38
|
-
type:
|
|
37
|
+
requirement: parts[1] || "latest",
|
|
38
|
+
type: "runtime",
|
|
39
39
|
}
|
|
40
40
|
end
|
|
41
41
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ecosystems-bibliothecary
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 14.2.0
|
|
4
|
+
version: 14.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Nesbitt
|
|
@@ -180,6 +180,7 @@ dependencies:
|
|
|
180
180
|
email:
|
|
181
181
|
- andrewnez@gmail.com
|
|
182
182
|
executables:
|
|
183
|
+
- benchmark
|
|
183
184
|
- bibliothecary
|
|
184
185
|
- console
|
|
185
186
|
- setup
|
|
@@ -201,6 +202,7 @@ files:
|
|
|
201
202
|
- README.md
|
|
202
203
|
- Rakefile
|
|
203
204
|
- bibliothecary.gemspec
|
|
205
|
+
- bin/benchmark
|
|
204
206
|
- bin/bibliothecary
|
|
205
207
|
- bin/console
|
|
206
208
|
- bin/setup
|