ecosystems-bibliothecary 14.2.0 → 14.3.0

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c531c54aa377c8bc30d1a2f75e3de0bbad1a0502568976f3d487fe3c4c78bc53
4
- data.tar.gz: 82b7ca70158bc5ce1094af762eed9b7cb20fa6492c807663c965266bc8ce535a
3
+ metadata.gz: c529c2cc8f2f35098f5bbca1294b7a36249580d6ef44a735b00257422dfe568c
4
+ data.tar.gz: 2680b1d61e665b2bef8ef5b987f3b58b29559266549e3cd68ccdaa8a65d2c4b0
5
5
  SHA512:
6
- metadata.gz: a981fd824d3227d00b9a937199ab3eeb007139c7f76c28de5a8681e1a680948cfe453abab0a791ea4b65c56d6f2d22943b7a834b6c83c3df36de746504bb2c1d
7
- data.tar.gz: 35ec260ba3a5a92a84a5db142771681eb609feb1a65288471f1152f29838d118484f7e1ad9f0e6ff021547b322a8dc0a384a53bf5939e55ee6b4430f4224bc24
6
+ metadata.gz: 7518c86a4817297d41c9bbab194901de59a0ca6f07dcad90a9b7d7a78584042d08f9765fc21080475767d399fc6e59f274f8e0132f6673d5e6f7dc2e7bc24908
7
+ data.tar.gz: e916a1d6f2c4fba1f908c8abc0717013ca21d870f7ddea1c0257c579c3de619037e6cf01ab0eb6f57be2c8191482655267b0765020d19cf8d5da7597dbf56ba8
data/CHANGELOG.md CHANGED
@@ -13,6 +13,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
13
13
 
14
14
  ### Removed
15
15
 
16
+ ## [14.3.0]
17
+
18
+ ### Added
19
+
20
+ - Added `bin/benchmark` script for performance testing.
21
+
22
+ ### Changed
23
+
24
+ - Fixed bug where Runner was recreated on every Bibliothecary method call, causing repeated index rebuilding.
25
+ - Memoized package_managers array in Runner.
26
+ - Added filename/extension index for O(1) parser lookup instead of O(n) linear scan through all parsers.
27
+ - Optimized `identify_manifests` to use filename index directly (~139x faster).
28
+ - Optimized `analyse_file` to use filename index for candidate filtering (~16x faster).
29
+ - Added per-file caching of mapping details in FileInfo to avoid repeated lookups.
30
+ - Added `parse_file_info` method to reuse FileInfo objects during parsing.
31
+
16
32
  ## [14.2.0]
17
33
 
18
34
  ### Added
data/Gemfile CHANGED
@@ -7,6 +7,7 @@ gem "strings-ansi", ref: "35d0c9430cf0a8022dc12bdab005bce296cb9f00", github: "pi
7
7
 
8
8
  # Ruby 3.4+ no longer includes these as default gems
9
9
  gem "base64"
10
+ gem "benchmark"
10
11
  gem "bigdecimal"
11
12
  gem "csv"
12
13
  gem "logger"
data/README.md CHANGED
@@ -13,7 +13,7 @@ Requires Ruby 3.4 or above.
13
13
  Add this line to your application's Gemfile:
14
14
 
15
15
  ```ruby
16
- gem "bibliothecary", git: "https://github.com/ecosyste-ms/bibliothecary.git"
16
+ gem "ecosystems-bibliothecary", git: "https://github.com/ecosyste-ms/bibliothecary.git", require: "bibliothecary"
17
17
  ```
18
18
 
19
19
  And then execute:
data/bin/benchmark ADDED
@@ -0,0 +1,386 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "bibliothecary"
6
+ require "benchmark"
7
+ require "optparse"
8
+
9
+ class InfrastructureBenchmark
10
+ FIXTURES_DIR = File.expand_path("../spec/fixtures", __dir__)
11
+
12
+ def initialize(iterations: 100)
13
+ @iterations = iterations
14
+ end
15
+
16
+ def run
17
+ puts "Infrastructure Benchmark"
18
+ puts "=" * 60
19
+ puts "Iterations: #{@iterations}"
20
+ puts
21
+
22
+ # Prepare test data - mix of different file types
23
+ test_files = prepare_test_files
24
+ puts "Test files: #{test_files.length}"
25
+ puts
26
+
27
+ benchmark_package_managers
28
+ benchmark_matching_overhead
29
+ benchmark_load_file_info_list_from_contents(test_files)
30
+ benchmark_analyse_file(test_files)
31
+ end
32
+
33
+ def prepare_test_files
34
+ files = []
35
+ # Get a representative sample of fixtures
36
+ %w[package.json Gemfile.lock Cargo.toml pom.xml requirements.txt go.mod].each do |name|
37
+ path = Dir.glob("#{FIXTURES_DIR}/**/#{name}").first
38
+ next unless path
39
+
40
+ files << { file_path: path.sub("#{FIXTURES_DIR}/", ""), contents: File.read(path) }
41
+ end
42
+ files
43
+ end
44
+
45
+ def benchmark_package_managers
46
+ puts "package_managers method:"
47
+ puts "-" * 40
48
+
49
+ runner = Bibliothecary.runner
50
+
51
+ # Warm up
52
+ 5.times { runner.package_managers }
53
+
54
+ time = Benchmark.measure do
55
+ @iterations.times { runner.package_managers }
56
+ end
57
+
58
+ printf " %d calls: %.3f ms total, %.4f ms/call\n",
59
+ @iterations, time.real * 1000, (time.real / @iterations) * 1000
60
+ puts
61
+ end
62
+
63
+ def benchmark_load_file_info_list_from_contents(test_files)
64
+ puts "load_file_info_list_from_contents:"
65
+ puts "-" * 40
66
+
67
+ runner = Bibliothecary.runner
68
+
69
+ # Warm up
70
+ 5.times { runner.load_file_info_list_from_contents(test_files) }
71
+
72
+ time = Benchmark.measure do
73
+ @iterations.times { runner.load_file_info_list_from_contents(test_files) }
74
+ end
75
+
76
+ total_files = test_files.length * @iterations
77
+ printf " %d calls (%d files each): %.3f ms total\n",
78
+ @iterations, test_files.length, time.real * 1000
79
+ printf " %.4f ms/call, %.4f ms/file\n",
80
+ (time.real / @iterations) * 1000,
81
+ (time.real / total_files) * 1000
82
+ puts
83
+ end
84
+
85
+ def benchmark_analyse_file(test_files)
86
+ puts "analyse_file (full pipeline):"
87
+ puts "-" * 40
88
+
89
+ runner = Bibliothecary.runner
90
+
91
+ test_files.each do |file|
92
+ # Warm up
93
+ 3.times { runner.analyse_file(file[:file_path], file[:contents]) }
94
+
95
+ time = Benchmark.measure do
96
+ @iterations.times { runner.analyse_file(file[:file_path], file[:contents]) }
97
+ end
98
+
99
+ printf " %-30s %.4f ms/call\n",
100
+ File.basename(file[:file_path]),
101
+ (time.real / @iterations) * 1000
102
+ end
103
+ puts
104
+ end
105
+
106
+ def benchmark_matching_overhead
107
+ puts "Matching overhead breakdown:"
108
+ puts "-" * 40
109
+
110
+ runner = Bibliothecary.runner
111
+ pms = runner.package_managers
112
+
113
+ # Test with a simple package.json
114
+ test_file = { file_path: "package.json", contents: '{"dependencies":{}}' }
115
+ info = Bibliothecary::FileInfo.new(nil, test_file[:file_path], test_file[:contents])
116
+
117
+ # Benchmark match_info? across all parsers
118
+ time = Benchmark.measure do
119
+ @iterations.times do
120
+ pms.each { |pm| pm.match_info?(info) }
121
+ end
122
+ end
123
+ printf " match_info? x %d parsers: %.4f ms/file\n",
124
+ pms.length, (time.real / @iterations) * 1000
125
+
126
+ # Benchmark just the npm parser's match_info?
127
+ npm = pms.find { |pm| pm.platform_name == "npm" }
128
+ time = Benchmark.measure do
129
+ @iterations.times { npm.match_info?(info) }
130
+ end
131
+ printf " npm.match_info? alone: %.4f ms/call\n",
132
+ (time.real / @iterations) * 1000
133
+
134
+ # Benchmark first_matching_mapping_details (called multiple times per file)
135
+ time = Benchmark.measure do
136
+ @iterations.times do
137
+ npm.send(:first_matching_mapping_details, info)
138
+ end
139
+ end
140
+ printf " first_matching_mapping_details: %.4f ms/call\n",
141
+ (time.real / @iterations) * 1000
142
+
143
+ puts
144
+ end
145
+ end
146
+
147
+ class ParserBenchmark
148
+ FIXTURES_DIR = File.expand_path("../spec/fixtures", __dir__)
149
+
150
+ # Parser methods that require remote services
151
+ REMOTE_PARSERS = {
152
+ "swiftpm" => [:parse_package_swift],
153
+ "hackage" => [:parse_cabal],
154
+ "hex" => [:parse_mix, :parse_mix_lock],
155
+ "carthage" => [:parse_cartfile, :parse_cartfile_private, :parse_cartfile_resolved],
156
+ "clojars" => [:parse_manifest],
157
+ }.freeze
158
+
159
+ # Multi-parser methods shared across many package managers
160
+ MULTI_PARSER_METHODS = %i[
161
+ parse_cyclonedx_json
162
+ parse_cyclonedx_xml
163
+ parse_spdx_json
164
+ parse_spdx_tag_value
165
+ parse_dependencies_csv
166
+ ].freeze
167
+
168
+ def initialize(options = {})
169
+ @iterations = options.fetch(:iterations, 100)
170
+ @parser_filter = options[:parser]
171
+ @verbose = options[:verbose]
172
+ @native_only = options[:native_only]
173
+ @results = {}
174
+ end
175
+
176
+ def run
177
+ puts "Bibliothecary Parser Benchmark"
178
+ puts "=" * 60
179
+ puts "Iterations per file: #{@iterations}"
180
+ puts "Fixtures directory: #{FIXTURES_DIR}"
181
+ puts
182
+
183
+ parsers = filtered_parsers
184
+ puts "Running benchmarks for #{parsers.length} parser(s)..."
185
+ puts
186
+
187
+ parsers.each do |parser|
188
+ benchmark_parser(parser)
189
+ end
190
+
191
+ print_summary
192
+ end
193
+
194
+ def filtered_parsers
195
+ all_parsers = Bibliothecary.package_managers
196
+ return all_parsers unless @parser_filter
197
+
198
+ matching = all_parsers.select do |pm|
199
+ pm.platform_name.downcase.include?(@parser_filter.downcase)
200
+ end
201
+
202
+ if matching.empty?
203
+ puts "No parser matching '#{@parser_filter}' found."
204
+ puts "Available parsers: #{all_parsers.map(&:platform_name).join(', ')}"
205
+ exit 1
206
+ end
207
+
208
+ matching
209
+ end
210
+
211
+ def benchmark_parser(parser)
212
+ platform = parser.platform_name
213
+ mapping = parser.mapping
214
+ fixtures = find_fixtures_for_parser(mapping, platform)
215
+
216
+ if fixtures.empty?
217
+ puts "#{platform}: no matching fixtures found"
218
+ puts if @verbose
219
+ return
220
+ end
221
+
222
+ puts "#{platform} (#{fixtures.length} files)"
223
+ puts "-" * 40
224
+
225
+ parser_total = 0
226
+ file_results = []
227
+
228
+ fixtures.each do |fixture_path, mapping_entry|
229
+ contents = File.read(fixture_path)
230
+ filename = File.basename(fixture_path)
231
+ relative = fixture_path.sub("#{FIXTURES_DIR}/", "")
232
+
233
+ begin
234
+ time = Benchmark.measure do
235
+ @iterations.times do
236
+ parser.send(mapping_entry[:parser], contents, options: { filename: filename })
237
+ end
238
+ end
239
+
240
+ avg_ms = (time.real / @iterations) * 1000
241
+ parser_total += time.real
242
+
243
+ file_results << {
244
+ file: relative,
245
+ total: time.real,
246
+ avg_ms: avg_ms,
247
+ kind: mapping_entry[:kind],
248
+ }
249
+
250
+ if @verbose
251
+ printf " %-40s %8.3f ms/call (%s)\n", relative, avg_ms, mapping_entry[:kind]
252
+ end
253
+ rescue Bibliothecary::RemoteParsingError => e
254
+ puts " #{relative}: skipped (remote parser unavailable)"
255
+ rescue => e
256
+ puts " #{relative}: error - #{e.class}: #{e.message}"
257
+ end
258
+ end
259
+
260
+ unless @verbose
261
+ file_results.sort_by { |r| -r[:avg_ms] }.first(3).each do |r|
262
+ printf " %-40s %8.3f ms/call\n", r[:file], r[:avg_ms]
263
+ end
264
+ puts " ..." if file_results.length > 3
265
+ end
266
+
267
+ avg_total = (parser_total / fixtures.length / @iterations) * 1000
268
+ printf " Total: %.3f ms avg per file\n", avg_total
269
+ puts
270
+
271
+ @results[platform] = {
272
+ files: fixtures.length,
273
+ total_time: parser_total,
274
+ avg_per_file: avg_total,
275
+ file_results: file_results,
276
+ }
277
+ end
278
+
279
+ def find_fixtures_for_parser(mapping, platform_name)
280
+ fixtures = []
281
+ remote_methods = REMOTE_PARSERS[platform_name] || []
282
+
283
+ all_fixtures.each do |fixture_path|
284
+ filename = File.basename(fixture_path)
285
+ relative_path = fixture_path.sub("#{FIXTURES_DIR}/", "")
286
+
287
+ mapping.each do |matcher, entry|
288
+ next unless entry[:parser]
289
+ next if remote_methods.include?(entry[:parser])
290
+ next if @native_only && MULTI_PARSER_METHODS.include?(entry[:parser])
291
+
292
+ if matcher_matches?(matcher, filename, fixture_path)
293
+ fixtures << [fixture_path, entry]
294
+ break
295
+ end
296
+ end
297
+ end
298
+
299
+ fixtures
300
+ end
301
+
302
+ def matcher_matches?(matcher, filename, full_path)
303
+ relative_path = full_path.sub("#{FIXTURES_DIR}/", "")
304
+
305
+ case matcher
306
+ when Regexp
307
+ filename.match?(matcher)
308
+ when String
309
+ filename == matcher
310
+ when Proc
311
+ matcher.call(relative_path)
312
+ else
313
+ false
314
+ end
315
+ end
316
+
317
+ def all_fixtures
318
+ @all_fixtures ||= Dir.glob("#{FIXTURES_DIR}/**/*")
319
+ .select { |f| File.file?(f) }
320
+ .reject { |f| f.include?("/broken/") }
321
+ end
322
+
323
+ def print_summary
324
+ return if @results.empty?
325
+
326
+ puts "=" * 60
327
+ puts "Summary (sorted by avg time per file)"
328
+ puts "=" * 60
329
+
330
+ sorted = @results.sort_by { |_, v| -v[:avg_per_file] }
331
+
332
+ printf "%-20s %10s %12s\n", "Parser", "Files", "Avg ms/file"
333
+ printf "%-20s %10s %12s\n", "-" * 20, "-" * 10, "-" * 12
334
+
335
+ sorted.each do |platform, data|
336
+ printf "%-20s %10d %12.3f\n", platform, data[:files], data[:avg_per_file]
337
+ end
338
+
339
+ puts
340
+ total_files = @results.values.sum { |v| v[:files] }
341
+ total_time = @results.values.sum { |v| v[:total_time] }
342
+ puts "Total: #{total_files} files, #{(total_time * 1000).round(1)} ms total time"
343
+ end
344
+ end
345
+
346
+ options = {
347
+ iterations: 100,
348
+ verbose: false,
349
+ mode: :parsers,
350
+ }
351
+
352
+ OptionParser.new do |opts|
353
+ opts.banner = "Usage: bin/benchmark [options]"
354
+
355
+ opts.on("-p", "--parser NAME", "Only benchmark parsers matching NAME") do |p|
356
+ options[:parser] = p
357
+ end
358
+
359
+ opts.on("-n", "--iterations N", Integer, "Number of iterations per file (default: 100)") do |n|
360
+ options[:iterations] = n
361
+ end
362
+
363
+ opts.on("-v", "--verbose", "Show all files, not just slowest") do
364
+ options[:verbose] = true
365
+ end
366
+
367
+ opts.on("--native-only", "Exclude shared multi-parsers (CycloneDX, SPDX, CSV)") do
368
+ options[:native_only] = true
369
+ end
370
+
371
+ opts.on("--infra", "Benchmark infrastructure (load_file_info_list, etc)") do
372
+ options[:mode] = :infra
373
+ end
374
+
375
+ opts.on("-h", "--help", "Show this help") do
376
+ puts opts
377
+ exit
378
+ end
379
+ end.parse!
380
+
381
+ case options[:mode]
382
+ when :infra
383
+ InfrastructureBenchmark.new(iterations: options[:iterations]).run
384
+ else
385
+ ParserBenchmark.new(options).run
386
+ end
@@ -40,7 +40,7 @@ module Bibliothecary
40
40
  # If your Parser needs to return multiple responses for one file, please override this method
41
41
  # For example see conda.rb
42
42
  kind = determine_kind_from_info(info)
43
- parser_result = parse_file(info.relative_path, info.contents, options: options)
43
+ parser_result = parse_file_info(info, options: options)
44
44
  parser_result = ParserResult.new(dependencies: []) if parser_result.nil? # work around any legacy parsers that return nil
45
45
 
46
46
  Bibliothecary::Analyser.create_analysis(platform_name, info.relative_path, kind, parser_result)
@@ -52,26 +52,31 @@ module Bibliothecary
52
52
  # Call the matching parse class method for this file with
53
53
  # these contents
54
54
  def parse_file(filename, contents, options: {})
55
- details = first_matching_mapping_details(FileInfo.new(nil, filename, contents))
55
+ parse_file_info(FileInfo.new(nil, filename, contents), options: options)
56
+ end
57
+
58
+ # Parse a file using its FileInfo object, reusing cached mapping details.
59
+ def parse_file_info(info, options: {})
60
+ details = first_matching_mapping_details(info)
56
61
 
57
62
  # this can be raised if we don't check match?/match_info?,
58
63
  # OR don't have the file contents when we check them, so
59
64
  # it turns out for example that a .xml file isn't a
60
65
  # manifest after all.
61
- raise Bibliothecary::FileParsingError.new("No parser for this file type", filename) unless details[:parser]
66
+ raise Bibliothecary::FileParsingError.new("No parser for this file type", info.relative_path) unless details[:parser]
62
67
 
63
68
  # The `parser` method should raise an exception if the file is malformed,
64
69
  # should return empty [] if the file is fine but simply doesn't contain
65
70
  # any dependencies, and should never return nil. At the time of writing
66
71
  # this comment, some of the parsers return [] or nil to mean an error
67
72
  # which is confusing to users.
68
- send(details[:parser], contents, options: options.merge(filename: filename))
73
+ send(details[:parser], info.contents, options: options.merge(filename: info.relative_path))
69
74
  rescue Exception => e # default is StandardError but C bindings throw Exceptions # rubocop:disable Lint/RescueException
70
75
  # the C xml parser also puts a newline at the end of the message
71
76
  location = e.backtrace_locations[0]
72
77
  .to_s
73
78
  .then { |l| l =~ /bibliothecary\// ? l.split("bibliothecary/").last : l.split("gems/").last }
74
- raise Bibliothecary::FileParsingError.new(e.message.strip, filename, location)
79
+ raise Bibliothecary::FileParsingError.new(e.message.strip, info.relative_path, location)
75
80
  end
76
81
 
77
82
  private
@@ -52,12 +52,14 @@ module Bibliothecary
52
52
  first_matching_mapping_details(info).any?
53
53
  end
54
54
 
55
- private
56
-
55
+ # Get mapping details for this file, using cache if available.
56
+ # The cache is stored on the FileInfo object to avoid repeated lookups.
57
57
  def first_matching_mapping_details(info)
58
- mapping
59
- .find { |matcher, details| mapping_entry_match?(matcher, details, info) }
60
- &.last || {}
58
+ info.cached_mapping_details(self) do
59
+ mapping
60
+ .find { |matcher, details| mapping_entry_match?(matcher, details, info) }
61
+ &.last || {}
62
+ end
61
63
  end
62
64
  end
63
65
  end
@@ -78,6 +78,7 @@ module Bibliothecary
78
78
 
79
79
  original_mapping = mapping
80
80
 
81
+ singleton_class.remove_method(:mapping)
81
82
  define_singleton_method(:mapping) do
82
83
  original_mapping.merge(klass.mapping)
83
84
  end
@@ -46,10 +46,17 @@ module Bibliothecary
46
46
  @contents = contents
47
47
 
48
48
  @package_manager = nil
49
+ @mapping_cache = {}
49
50
  end
50
51
 
51
52
  def groupable?
52
53
  @package_manager&.groupable?(self)
53
54
  end
55
+
56
+ # Cache and retrieve mapping details for a given package manager class.
57
+ # This avoids repeatedly calling first_matching_mapping_details.
58
+ def cached_mapping_details(package_manager_class)
59
+ @mapping_cache[package_manager_class] ||= yield
60
+ end
54
61
  end
55
62
  end
@@ -201,7 +201,7 @@ module Bibliothecary
201
201
 
202
202
  def self.parse_paket_lock(file_contents, options: {})
203
203
  lines = file_contents.split("\n")
204
- package_version_re = /\s+(?<name>\S+)\s\((?<version>\d+\.\d+[.\d+[.\d+]*]*)\)/
204
+ package_version_re = /\s+(?<name>\S+)\s\((?<version>\d+(?:\.\d+)+)\)/
205
205
  packages = lines.select { |line| package_version_re.match(line) }.map { |line| package_version_re.match(line) }.map do |match|
206
206
  Dependency.new(
207
207
  name: match[:name].strip,
@@ -40,12 +40,160 @@ module Bibliothecary
40
40
  end
41
41
 
42
42
  def applicable_package_managers(info)
43
- managers = package_managers.select { |pm| pm.match_info?(info) }
43
+ candidates = candidate_package_managers(info.relative_path)
44
+ managers = candidates.select { |pm| pm.match_info?(info) }
44
45
  managers.empty? ? [nil] : managers
45
46
  end
46
47
 
47
48
  def package_managers
48
- Bibliothecary::Parsers.constants.map { |c| Bibliothecary::Parsers.const_get(c) }.sort_by { |c| c.to_s.downcase }
49
+ @package_managers ||= Bibliothecary::Parsers.constants
50
+ .map { |c| Bibliothecary::Parsers.const_get(c) }
51
+ .sort_by { |c| c.to_s.downcase }
52
+ .freeze
53
+ end
54
+
55
+ # Get candidate package managers for a file path using filename/extension index.
56
+ # Falls back to all package managers for unindexed patterns.
57
+ def candidate_package_managers(path)
58
+ filename = File.basename(path)
59
+ filename_lower = filename.downcase
60
+
61
+ # Check exact filename match first (use fetch to avoid default block on frozen hash)
62
+ candidates = filename_index.fetch(filename_lower, nil)
63
+ return candidates if candidates
64
+
65
+ # Check extension matches
66
+ extension_index.each do |ext, ext_candidates|
67
+ return ext_candidates if filename_lower.end_with?(ext)
68
+ end
69
+
70
+ # Fall back to all package managers for unindexed patterns
71
+ package_managers
72
+ end
73
+
74
+ # Build an index mapping lowercase filenames to candidate parsers
75
+ def filename_index
76
+ @filename_index ||= build_filename_index
77
+ end
78
+
79
+ # Build an index mapping lowercase extensions to candidate parsers
80
+ def extension_index
81
+ @extension_index ||= build_extension_index
82
+ end
83
+
84
+ def build_filename_index
85
+ index = {}
86
+
87
+ package_managers.each do |pm|
88
+ pm.mapping.each_key do |matcher|
89
+ next unless matcher.is_a?(Proc)
90
+
91
+ # Extract filenames from the matcher by testing common patterns
92
+ extract_filenames_from_matcher(matcher).each do |filename|
93
+ key = filename.downcase
94
+ index[key] ||= []
95
+ index[key] << pm
96
+ end
97
+ end
98
+ end
99
+
100
+ # Deduplicate and freeze
101
+ index.transform_values! { |v| v.uniq.freeze }
102
+ index.freeze
103
+ end
104
+
105
+ def build_extension_index
106
+ index = {}
107
+
108
+ package_managers.each do |pm|
109
+ pm.mapping.each_key do |matcher|
110
+ next unless matcher.is_a?(Proc)
111
+
112
+ # Extract extensions from the matcher
113
+ extract_extensions_from_matcher(matcher).each do |ext|
114
+ key = ext.downcase
115
+ index[key] ||= []
116
+ index[key] << pm
117
+ end
118
+ end
119
+ end
120
+
121
+ # Deduplicate and freeze
122
+ index.transform_values! { |v| v.uniq.freeze }
123
+ index.freeze
124
+ end
125
+
126
+ # Try to extract filename patterns from a matcher proc
127
+ def extract_filenames_from_matcher(matcher)
128
+ filenames = []
129
+
130
+ # Test common manifest filenames to see which ones match
131
+ common_filenames.each do |filename|
132
+ filenames << filename if matcher.call(filename)
133
+ end
134
+
135
+ filenames
136
+ end
137
+
138
+ # Try to extract extension patterns from a matcher proc
139
+ def extract_extensions_from_matcher(matcher)
140
+ extensions = []
141
+
142
+ # Test common extensions
143
+ common_extensions.each do |ext|
144
+ test_file = "test#{ext}"
145
+ extensions << ext if matcher.call(test_file)
146
+ end
147
+
148
+ extensions
149
+ end
150
+
151
+ def common_filenames
152
+ @common_filenames ||= %w[
153
+ package.json package-lock.json yarn.lock pnpm-lock.yaml npm-shrinkwrap.json npm-ls.json bun.lock
154
+ Gemfile Gemfile.lock gems.rb gems.locked
155
+ Cargo.toml Cargo.lock
156
+ go.mod go.sum Gopkg.toml Gopkg.lock glide.yaml glide.lock Godeps
157
+ requirements.txt Pipfile Pipfile.lock pyproject.toml poetry.lock setup.py
158
+ pom.xml build.gradle build.gradle.kts ivy.xml
159
+ composer.json composer.lock
160
+ Podfile Podfile.lock
161
+ pubspec.yaml pubspec.lock
162
+ Package.swift Package.resolved
163
+ Cartfile Cartfile.resolved Cartfile.private
164
+ mix.exs mix.lock
165
+ project.clj
166
+ shard.yml shard.lock
167
+ environment.yml environment.yaml
168
+ bower.json
169
+ elm-package.json elm.json
170
+ vcpkg.json
171
+ dub.json dub.sdl
172
+ haxelib.json
173
+ action.yml action.yaml
174
+ Brewfile Brewfile.lock.json
175
+ REQUIRE Project.toml Manifest.toml
176
+ paket.lock packages.config Project.json Project.lock.json packages.lock.json project.assets.json
177
+ DESCRIPTION
178
+ META.json META.yml cpanfile
179
+ cabal.config
180
+ cyclonedx.json cyclonedx.xml
181
+ dependencies.csv
182
+ docker-compose.yml docker-compose.yaml Dockerfile
183
+ MLmodel
184
+ Modelfile
185
+ dvc.yaml
186
+ cog.yaml
187
+ bentofile.yaml
188
+ uv.lock pylock.toml
189
+ ].freeze
190
+ end
191
+
192
+ def common_extensions
193
+ @common_extensions ||= %w[
194
+ .gemspec .nuspec .csproj .cabal .podspec .podspec.json
195
+ .spdx .cdx.json .cdx.xml
196
+ ].freeze
49
197
  end
50
198
 
51
199
  # Parses an array of format [{file_path: "", contents: ""},] to match
@@ -120,7 +268,9 @@ module Bibliothecary
120
268
  def analyse_file(file_path, contents)
121
269
  contents = Bibliothecary.utf8_string(contents)
122
270
 
123
- package_managers.select { |pm| pm.match?(file_path, contents) }.map do |pm|
271
+ # Use filename index to quickly find candidate parsers
272
+ candidates = candidate_package_managers(file_path)
273
+ candidates.select { |pm| pm.match?(file_path, contents) }.map do |pm|
124
274
  pm.analyse_contents(file_path, contents, options: @options)
125
275
  end.flatten.uniq.compact
126
276
  end
@@ -137,14 +287,24 @@ module Bibliothecary
137
287
  ignored_dirs.include?(f) || f.start_with?(*ignored_dirs_with_slash)
138
288
  end
139
289
  allowed_file_list = allowed_file_list.reject { |f| ignored_files.include?(f) }
140
- package_managers.map do |pm|
141
- # (skip rubocop false positive, since match? is a custom method)
142
- allowed_file_list.select do |file_path| # rubocop:disable Style/SelectByRegexp
143
- # this is a call to match? without file contents, which will skip
144
- # ambiguous filenames that are only possibly a manifest
145
- pm.match?(file_path)
290
+
291
+ # Fast path: use filename index directly for known manifest filenames
292
+ # This avoids creating FileInfo objects and calling match? for each file
293
+ manifests = []
294
+ allowed_file_list.each do |file_path|
295
+ filename_lower = File.basename(file_path).downcase
296
+
297
+ # Check if this filename is in our index (known manifest)
298
+ if filename_index.key?(filename_lower)
299
+ manifests << file_path
300
+ next
146
301
  end
147
- end.flatten.uniq.compact
302
+
303
+ # Check extension index
304
+ matched = extension_index.keys.any? { |ext| filename_lower.end_with?(ext) }
305
+ manifests << file_path if matched
306
+ end
307
+ manifests.sort
148
308
  end
149
309
 
150
310
  def ignored_dirs
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Bibliothecary
4
- VERSION = "14.2.0"
4
+ VERSION = "14.3.0"
5
5
  end
data/lib/bibliothecary.rb CHANGED
@@ -100,19 +100,16 @@ module Bibliothecary
100
100
  end
101
101
 
102
102
  def self.runner
103
- configuration
104
- @runner
103
+ @runner ||= Runner.new(configuration)
105
104
  end
106
105
 
107
106
  def self.configuration
108
107
  @configuration ||= Configuration.new
109
- @runner = Runner.new(@configuration)
110
- @configuration
111
108
  end
112
109
 
113
110
  def self.reset
114
- @configuration = Configuration.new
115
- @runner = Runner.new(@configuration)
111
+ @configuration = nil
112
+ @runner = nil
116
113
  end
117
114
 
118
115
  def self.configure
@@ -4,7 +4,7 @@ class DockerfileParser
4
4
  end
5
5
 
6
6
  def parse
7
- fromlines = @file_contents.split("\n").select { |line| line.strip =~ /^\FROM/i }
7
+ fromlines = @file_contents.split("\n").select { |line| line.strip =~ /^FROM/i }
8
8
 
9
9
  fromlines.map do |line|
10
10
  line = line.strip.split(' ')
@@ -4,10 +4,10 @@ class ModelfileParser
4
4
  end
5
5
 
6
6
  def parse
7
- fromlines = @file_contents.split("\n").select { |line| line.strip =~ /^\FROM/i }
7
+ fromlines = @file_contents.split("\n").select { |line| line.strip =~ /^FROM/i }
8
8
 
9
9
  fromlines.map do |line|
10
- line = line.strip.split(' ')
10
+ line = line.strip.split
11
11
 
12
12
  # Remove the FROM keyword
13
13
  line.shift
@@ -23,19 +23,19 @@ class ModelfileParser
23
23
  model_ref = line[0]
24
24
 
25
25
  # Check if it's a file path (local GGUF or directory)
26
- if model_ref =~ /\.(gguf|safetensors)$/i || model_ref.start_with?('./', '/')
26
+ if model_ref =~ /\.(gguf|safetensors)$/i || model_ref.start_with?("./", "/")
27
27
  {
28
28
  name: File.basename(model_ref),
29
- requirement: 'local',
30
- type: 'runtime'
29
+ requirement: "local",
30
+ type: "runtime",
31
31
  }
32
32
  else
33
33
  # It's a registry model (e.g., llama3.2 or llama3.2:latest)
34
- parts = model_ref.split(':')
34
+ parts = model_ref.split(":")
35
35
  {
36
36
  name: parts[0],
37
- requirement: parts[1] || 'latest',
38
- type: 'runtime'
37
+ requirement: parts[1] || "latest",
38
+ type: "runtime",
39
39
  }
40
40
  end
41
41
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ecosystems-bibliothecary
3
3
  version: !ruby/object:Gem::Version
4
- version: 14.2.0
4
+ version: 14.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Nesbitt
@@ -180,6 +180,7 @@ dependencies:
180
180
  email:
181
181
  - andrewnez@gmail.com
182
182
  executables:
183
+ - benchmark
183
184
  - bibliothecary
184
185
  - console
185
186
  - setup
@@ -201,6 +202,7 @@ files:
201
202
  - README.md
202
203
  - Rakefile
203
204
  - bibliothecary.gemspec
205
+ - bin/benchmark
204
206
  - bin/bibliothecary
205
207
  - bin/console
206
208
  - bin/setup