github-linguist 7.30.0 → 8.0.0

Files changed (82)
  1. checksums.yaml +4 -4
  2. data/grammars/inline.edgeql.json +1 -1
  3. data/grammars/inline.peggy.json +1 -0
  4. data/grammars/markdown.move.codeblock.json +1 -1
  5. data/grammars/mdx.move.codeblock.json +1 -0
  6. data/grammars/source.abl.json +1 -1
  7. data/grammars/source.bicep.json +1 -1
  8. data/grammars/source.cairo.json +1 -1
  9. data/grammars/source.cairo0.json +1 -0
  10. data/grammars/source.cl.json +1 -1
  11. data/grammars/source.clar.json +1 -1
  12. data/grammars/source.clarion.json +1 -1
  13. data/grammars/source.cmd.json +1 -1
  14. data/grammars/source.commonlisp.json +1 -1
  15. data/grammars/source.cs.json +1 -1
  16. data/grammars/source.curlrc.json +1 -1
  17. data/grammars/source.curry.json +1 -1
  18. data/grammars/source.cylc.json +1 -0
  19. data/grammars/source.dart.json +1 -1
  20. data/grammars/source.dds.dspf.json +1 -1
  21. data/grammars/source.dds.icff.json +1 -1
  22. data/grammars/source.dds.lf.json +1 -1
  23. data/grammars/source.dds.pf.json +1 -1
  24. data/grammars/source.dds.prtf.json +1 -1
  25. data/grammars/source.dune.json +1 -0
  26. data/grammars/source.elvish.json +1 -1
  27. data/grammars/source.fsharp.json +1 -1
  28. data/grammars/source.gdscript.json +1 -1
  29. data/grammars/source.gitconfig.json +1 -1
  30. data/grammars/source.gleam.json +1 -1
  31. data/grammars/source.hgignore.json +1 -1
  32. data/grammars/source.hosts.json +1 -1
  33. data/grammars/source.iCalendar.json +1 -0
  34. data/grammars/source.ice.json +1 -1
  35. data/grammars/source.julia.json +1 -1
  36. data/grammars/source.just.json +1 -1
  37. data/grammars/source.lcb.json +1 -0
  38. data/grammars/source.lilypond.json +1 -1
  39. data/grammars/source.livecodescript.json +1 -0
  40. data/grammars/source.luau.json +1 -1
  41. data/grammars/source.matlab.json +1 -1
  42. data/grammars/source.mcfunction.json +1 -1
  43. data/grammars/source.mdx.json +1 -1
  44. data/grammars/source.mo.json +1 -1
  45. data/grammars/source.move.json +1 -1
  46. data/grammars/source.nanorc.json +1 -1
  47. data/grammars/source.nr.json +1 -0
  48. data/grammars/source.p4.json +1 -1
  49. data/grammars/source.peggy.json +1 -0
  50. data/grammars/source.polar.json +1 -1
  51. data/grammars/source.powerbuilder.json +1 -0
  52. data/grammars/source.qsharp.json +1 -1
  53. data/grammars/source.rpgle.json +1 -1
  54. data/grammars/source.rust.json +1 -1
  55. data/grammars/source.sentinel.json +1 -1
  56. data/grammars/source.sourcepawn.json +1 -1
  57. data/grammars/source.stan.json +1 -1
  58. data/grammars/source.swift.json +1 -1
  59. data/grammars/source.sy.json +1 -1
  60. data/grammars/source.vba.json +1 -1
  61. data/grammars/source.vcard.json +1 -0
  62. data/grammars/source.wdl.json +1 -1
  63. data/grammars/text.adblock.json +1 -1
  64. data/grammars/text.html.jte.json +1 -0
  65. data/grammars/text.html.statamic.json +1 -1
  66. data/grammars/text.md.json +1 -1
  67. data/grammars/text.mdx.astro.codeblock.json +1 -0
  68. data/grammars/version +1 -1
  69. data/lib/linguist/VERSION +1 -1
  70. data/lib/linguist/classifier.rb +315 -106
  71. data/lib/linguist/generated.rb +17 -4
  72. data/lib/linguist/generic.yml +1 -0
  73. data/lib/linguist/heuristics.rb +6 -6
  74. data/lib/linguist/heuristics.yml +54 -4
  75. data/lib/linguist/languages.json +1 -1
  76. data/lib/linguist/languages.yml +123 -7
  77. data/lib/linguist/samples.json +1 -1
  78. data/lib/linguist/samples.rb +9 -1
  79. data/lib/linguist/sha256.rb +1 -1
  80. metadata +17 -5
  81. data/grammars/markdown.mcfunction.codeblock.json +0 -1
  82. data/grammars/mdx.LANGUAGE.codeblock.json +0 -1
data/grammars/text.mdx.astro.codeblock.json ADDED
@@ -0,0 +1 @@
+ {"scopeName":"text.mdx.astro.codeblock","patterns":[{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(`{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r`])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}},{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(~{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}}],"repository":{"astro-code-block":{"patterns":[{"contentName":"meta.embedded.block.astro.frontmatter","begin":"^\\s*---\\s*$","end":"^\\s*---\\s*$","patterns":[{"include":"source.tsx"}],"beginCaptures":{"0":{"name":"punctuation.definition.tag.xi.begin.t"}},"endCaptures":{"0":{"name":"punctuation.definition.tag.xi.end.t"}}},{"include":"source.astro"}]}}}
data/grammars/version CHANGED
@@ -1 +1 @@
- 7.30.0
+ 8.0.0
data/lib/linguist/VERSION CHANGED
@@ -1 +1 @@
- 7.30.0
+ 8.0.0
data/lib/linguist/classifier.rb CHANGED
@@ -1,8 +1,12 @@
  require 'linguist/tokenizer'
+ require 'set'

  module Linguist
- # Language bayesian classifier.
+ # Language content classifier.
  class Classifier
+ # Maximum number of bytes to consider for classification.
+ # This is only used at evaluation time. During training, full content of
+ # samples is used.
  CLASSIFIER_CONSIDER_BYTES = 50 * 1024

  # Public: Use the classifier to detect language of the blob.
@@ -28,41 +32,59 @@ module Linguist
  #
  # db - Hash classifier database object
  # language - String language of data
- # data - String contents of file
+ # data - String contents of file or array of tokens.
  #
  # Examples
  #
- # Classifier.train(db, 'Ruby', "def hello; end")
+ # Classifier.train!(db, 'Ruby', "def hello; end")
  #
- # Returns nothing.
+ # Returns nil.
  #
- # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
- # per-language. See also #dump_all_tokens, below.
+ # Set LINGUIST_DEBUG=1, =2 or =3 to print internal statistics.
  def self.train!(db, language, data)
  tokens = data
  tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

- counts = Hash.new(0)
- tokens.each { |tok| counts[tok] += 1 }
+ db['vocabulary'] ||= {}
+ # Set hash to autoincremented index value
+ if db['vocabulary'].default_proc.nil?
+ db['vocabulary'].default_proc = proc do |hash, key|
+ hash[key] = hash.length
+ end
+ end

- db['tokens_total'] ||= 0
- db['languages_total'] ||= 0
- db['tokens'] ||= {}
- db['language_tokens'] ||= {}
- db['languages'] ||= {}
+ db['samples'] ||= {}
+ db['samples'][language] ||= []

- counts.each do |token, count|
- db['tokens'][language] ||= {}
- db['tokens'][language][token] ||= 0
- db['tokens'][language][token] += count
- db['language_tokens'][language] ||= 0
- db['language_tokens'][language] += count
- db['tokens_total'] += count
- end
- db['languages'][language] ||= 0
- db['languages'][language] += 1
- db['languages_total'] += 1
+ termfreq = to_vocabulary_index_termfreq(db['vocabulary'], tokens)
+ db['samples'][language] << termfreq
+
+ nil
+ end
+
+ # Public: Finalize training.
+ #
+ # db - Hash classifier database object
+ #
+ # Examples:
+ # Classifier.finalize_train!(db)
+ #
+ # Returns nil.
+ #
+ # This method must be called after the last #train! call.
+ def self.finalize_train!(db)
+ db['vocabulary'] ||= {}
+
+ # Unset hash autoincrement
+ db['vocabulary'].default_proc = nil

+ db['samples'] ||= []
+ filter_vocab_by_freq! db, MIN_DOCUMENT_FREQUENCY
+ sort_vocab! db
+ db['icf'] = inverse_class_freqs db
+ normalize_samples! db
+ db['centroids'] = get_centroids db
+ db.delete 'samples'
  nil
  end

@@ -78,20 +100,17 @@ module Linguist
  # # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
  #
  # Returns sorted Array of result pairs. Each pair contains the
- # String language name and a Float score.
+ # String language name and a Float score between 0.0 and 1.0.
  def self.classify(db, tokens, languages = nil)
- languages ||= db['languages'].keys
+ languages ||= db['centroids'].keys
  new(db).classify(tokens, languages)
  end

  # Internal: Initialize a Classifier.
  def initialize(db = {})
- @tokens_total = db['tokens_total']
- @languages_total = db['languages_total']
- @tokens = db['tokens']
- @language_tokens = db['language_tokens']
- @languages = db['languages']
- @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
+ @vocabulary = db['vocabulary']
+ @centroids = db['centroids']
+ @icf = db['icf']
  end

  # Internal: Guess language of data
@@ -100,72 +119,70 @@ module Linguist
  # languages - Array of language name Strings to restrict to.
  #
  # Returns sorted Array of result pairs. Each pair contains the
- # String language name and a Float score.
+ # String language name and a Float score between 0.0 and 1.0.
  def classify(tokens, languages)
  return [] if tokens.nil? || languages.empty?
  tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
- scores = {}

- debug_dump_all_tokens(tokens, languages) if verbosity >= 2
+ debug_dump_tokens(tokens) if verbosity >= 3

- counts = Hash.new(0)
- tokens.each { |tok| counts[tok] += 1 }
+ vec = Classifier.to_vocabulary_index_termfreq_gaps(@vocabulary, tokens)
+ vec.each do |idx, freq|
+ tf = 1.0 + Math.log(freq)
+ vec[idx] = tf * @icf[idx]
+ end
+ return [] if vec.empty?
+ Classifier.l2_normalize!(vec)

+ scores = {}
  languages.each do |language|
- scores[language] = tokens_probability(counts, language) + language_probability(language)
- debug_dump_probabilities(counts, language, scores[language]) if verbosity >= 1
+ centroid = @centroids[language]
+ score = Classifier.similarity(vec, centroid)
+ if score > 0.0
+ scores[language] = score
+ end
  end
-
- scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
+ scores = scores.sort_by { |x| -x[1] }
+ debug_dump_all_tokens(tokens, scores) if verbosity >= 2
+ debug_dump_scores(scores) if verbosity >= 1
+ scores
  end

- # Internal: Probably of set of tokens in a language occurring - P(D | C)
- #
- # tokens - Array of String tokens.
- # language - Language to check.
- #
- # Returns Float between 0.0 and 1.0.
- def tokens_probability(counts, language)
- sum = 0
- counts.each do |token, count|
- sum += count * token_probability(token, language)
+ private
+ MIN_DOCUMENT_FREQUENCY = 2
+
+ def verbosity
+ @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
  end
- sum
- end

- # Internal: Log-probability of token in language occurring - P(F | C)
- #
- # token - String token.
- # language - Language to check.
- #
- # Returns Float.
- def token_probability(token, language)
- count = @tokens[language][token]
- if count.nil? || count == 0
- # This is usually the most common case, so we cache the result.
- @unknown_logprob
- else
- Math.log(count.to_f / @language_tokens[language].to_f)
+ def debug_dump_scores(scores)
+ headers = ["Language", "Score"]
+ rows = scores.map { |l, s| [l, "%.3f" % s] }
+ dump_table(headers, rows)
  end
- end

- # Internal: Probably of a language occurring - P(C)
- #
- # language - Language to check.
- #
- # Returns Float between 0.0 and 1.0.
- def language_probability(language)
- Math.log(@languages[language].to_f / @languages_total.to_f)
- end
+ def debug_dump_tokens(tokens)
+ counts = Hash.new(0)
+ tokens.each do |tok|
+ idx = @vocabulary[tok]
+ if not idx.nil?
+ counts[tok] += 1
+ end
+ end

- private
- def verbosity
- @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
- end
+ norm = Classifier.l2_norm(counts)
+ rows = counts.map do |tok, tf|
+ idx = @vocabulary[tok]
+ log_tf = 1.0 + Math.log(tf)
+ with_icf = log_tf * @icf[idx]
+ normalized = with_icf / norm
+ row = [tok, tf, "%.3f" % log_tf, "%.3f" % with_icf, "%.3f" % normalized]
+ [normalized, row]
+ end

- def debug_dump_probabilities(tokens, language, score)
- printf("%10s = %10.3f + %7.3f = %10.3f\n",
- language, tokens_probability(tokens, language), language_probability(language), score)
+ headers = ["Token", "TF", "log", "*ICF", "L2"]
+ rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+ dump_table(headers, rows)
  end

  # Internal: show a table of probabilities for each <token,language> pair.
@@ -173,31 +190,223 @@ module Linguist
  # The number in each table entry is the number of "points" that each
  # token contributes toward the belief that the file under test is a
  # particular language. Points are additive.
- #
- # Points are the number of times a token appears in the file, times
- # how much more likely (log of probability ratio) that token is to
- # appear in one language vs. the least-likely language. Dashes
- # indicate the least-likely language (and zero points) for each token.
- def debug_dump_all_tokens(tokens, languages)
- maxlen = tokens.map { |tok| tok.size }.max
-
- printf "%#{maxlen}s", ""
- puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join
-
- token_map = Hash.new(0)
- tokens.each { |tok| token_map[tok] += 1 }
-
- token_map.sort.each { |tok, count|
- arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
- min = arr.map { |a,b| b }.min
- if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
- printf "%#{maxlen}s%5d", tok, count
-
- puts arr.map { |ent|
- ent[1] == min ? " -" : sprintf("%10.3f", count * (ent[1] - min))
- }.join
+ def debug_dump_all_tokens(tokens, scores)
+ languages = scores.map { |l, _| l }
+
+ counts = Hash.new(0)
+ tokens.each do |tok|
+ idx = @vocabulary[tok]
+ if not idx.nil?
+ counts[tok] += 1
+ end
+ end
+
+ data = {}
+ norm = Classifier.l2_norm(counts)
+ languages.each do |language|
+ data[language] = {}
+ counts.each do |tok, tf|
+ idx = @vocabulary[tok]
+ log_tf = 1.0 + Math.log(tf)
+ with_icf = log_tf * @icf[idx]
+ normalized = with_icf / norm
+ data[language][tok] = normalized * @centroids[language][idx].to_f
+ end
+ end
+
+ norm = Classifier.l2_norm(counts)
+ rows = counts.map do |tok, tf|
+ idx = @vocabulary[tok]
+ log_tf = 1.0 + Math.log(tf)
+ with_icf = log_tf * @icf[idx]
+ normalized = with_icf / norm
+ scores = languages.map do |l, _|
+ [l, data[l][tok].to_f]
+ end
+ max_score = scores.to_h.values.max
+ row = [tok] + scores.map do |l, s|
+ if s == max_score
+ "%.4f*" % s
+ elsif s > 0.0
+ "%.4f" % s
+ else
+ "-"
+ end
+ end
+ [normalized, row]
+ end
+ headers = ["Token"] + (0..languages.length-1).map { |lidx| "[#{lidx}]" }
+ rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+ legend = languages.each_with_index.map { |l, lidx| "[#{lidx}] = #{l}" }
+ dump_table(headers, rows, legend)
+ end
+
+ def dump_table(header, rows, legend = nil)
+ n_cols = header.length
+ rows = rows.map { |r| r.map { |c| c.to_s } }
+ col_widths = (0..n_cols - 1).map do |j|
+ ([header[j].length] + rows.map { |row| row[j].length }).max
+ end
+ sep_line = "| #{(0..n_cols-1).map { |j| "-" * col_widths[j] }.join(" | ")} |"
+ content_width = sep_line.length - 4
+ top_line = "| #{"-" * content_width} |"
+
+ format_row = Proc.new do |row|
+ cells = row.zip(col_widths).map do |cell, width|
+ "%-#{width}s" % cell
+ end
+ "| %s |" % cells.join(" | ")
+ end
+
+ puts top_line
+ puts format_row.call(header)
+ puts sep_line
+ rows.each do |row|
+ puts format_row.call(row)
+ end
+ puts top_line
+ if legend
+ legend.each do |line|
+ puts "| %-#{content_width}s |" % line
+ end
+ puts top_line
+ end
+ end
+
+ def self.to_vocabulary_index_termfreq(vocab, tokens)
+ counts = Hash.new(0)
+ tokens.each do |key|
+ idx = vocab[key]
+ counts[idx] += 1
+ end
+ counts
+ end
+
+ def self.to_vocabulary_index_termfreq_gaps(vocab, tokens)
+ counts = Hash.new(0)
+ tokens.each do |key|
+ if vocab.key? key
+ idx = vocab[key]
+ counts[idx] += 1
+ end
+ end
+ counts
+ end
+
+ def self.l2_norm(vec)
+ norm = vec.values.inject(0.0) { |sum, x| sum + x**2 }
+ Math.sqrt(norm)
+ end
+
+ def self.l2_normalize!(vec)
+ norm = l2_norm(vec)
+ vec.transform_values! do |value|
+ value.to_f / norm
+ end
+ nil
+ end
+
+ def self.similarity(a, b)
+ sum = 0.0
+ a.each_key do |idx|
+ if b.key? idx
+ sum += a[idx] * b[idx]
  end
- }
+ end
+ sum
  end
+
+ # Filter vocabulary by minimum document frequency.
+ def self.filter_vocab_by_freq!(db, min_freq)
+ vocabulary = db['vocabulary']
+
+ # Get document frequencies
+ docfreq = Array.new(vocabulary.size, 0)
+ db['samples'].each_value do |samples|
+ samples.each do |sample|
+ sample.each_key do |idx|
+ docfreq[idx] += 1
+ end
+ end
+ end
+
+ vocabulary.select! do |_, idx|
+ docfreq[idx] >= min_freq
+ end
+
+ nil
+ end
+
+ # Sort vocabulary lexicographically.
+ def self.sort_vocab!(db)
+ new_indices = Hash.new { |h,k| h[k] = h.length }
+ db['vocabulary'].sort_by { |x| x[0] }.each do |term, idx|
+ db['vocabulary'][term] = new_indices[idx]
+ end
+ new_indices.default_proc = nil
+
+ db['samples'].transform_values! do |samples|
+ samples.map do |sample|
+ new_sample = {}
+ sample.each do |idx, freq|
+ new_idx = new_indices[idx]
+ if not new_idx.nil?
+ new_sample[new_idx] = freq
+ end
+ end
+ new_sample
+ end
+ end
+ end
+
+ # Compute inverse class frequency (ICF) for every term.
+ def self.inverse_class_freqs(db)
+ icf = Array.new(db['vocabulary'].size, 0)
+ db['samples'].each_value do |samples|
+ terms = Set.new
+ samples.each do |sample|
+ terms |= sample.keys
+ end
+ terms.each do |idx|
+ icf[idx] += 1
+ end
+ end
+ icf.map! do |val|
+ Math.log(db['samples'].size.to_f / val.to_f) + 1
+ end
+ icf
+ end
+
+ def self.normalize_samples!(db)
+ icf = db['icf']
+ db['samples'].each_value do |samples|
+ samples.each do |sample|
+ sample.each do |idx, freq|
+ tf = 1.0 + Math.log(freq)
+ sample[idx] = tf * icf[idx]
+ end
+ l2_normalize! sample
+ end
+ end
+ end
+
+ def self.get_centroids(db)
+ centroids = {}
+ db['samples'].each do |language, samples|
+ centroid = Hash.new(0.0)
+ samples.each do |sample|
+ sample.each do |idx, val|
+ centroid[idx] += val
+ end
+ end
+ centroid.each_key do |idx|
+ centroid[idx] = centroid[idx] / samples.length
+ end
+ l2_normalize! centroid
+ centroids[language] = centroid
+ end
+ centroids
+ end
+
  end
  end
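
Note: the classifier.rb rewrite above drops the Naive Bayes token counts and instead stores a shared vocabulary, per-term inverse class frequencies ('icf') and one L2-normalized TF-ICF centroid per language; classification now scores a document by cosine similarity against those centroids. A minimal sketch of the new train/finalize/classify cycle, using only the public methods shown in this diff (the sample snippets and the printed scores are illustrative, not taken from the gem):

  require 'linguist'

  db = {}
  Linguist::Classifier.train!(db, 'Ruby',   "def hello; puts 'hi'; end")
  Linguist::Classifier.train!(db, 'Ruby',   "class Foo; def bar; end; end")
  Linguist::Classifier.train!(db, 'Python', "def hello():\n    print('hi')")
  Linguist::Classifier.train!(db, 'Python', "class Foo:\n    pass")

  # Required once after the last train! call: drops terms seen in fewer than
  # MIN_DOCUMENT_FREQUENCY samples, computes ICF weights, and collapses the
  # per-language samples into L2-normalized centroids.
  Linguist::Classifier.finalize_train!(db)

  # Scores are cosine similarities in [0.0, 1.0], sorted descending.
  p Linguist::Classifier.classify(db, "def add(a, b); a + b; end")
  # => [["Ruby", 0.93], ["Python", 0.18]]  (illustrative numbers)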
data/lib/linguist/generated.rb CHANGED
@@ -61,6 +61,7 @@ module Linguist
  composer_lock? ||
  cargo_lock? ||
  cargo_orig? ||
+ deno_lock? ||
  flake_lock? ||
  bazel_lock? ||
  node_modules? ||
@@ -68,6 +69,7 @@ module Linguist
  go_lock? ||
  poetry_lock? ||
  pdm_lock? ||
+ uv_lock? ||
  esy_lock? ||
  npm_shrinkwrap_or_package_lock? ||
  pnpm_lock? ||
@@ -422,6 +424,13 @@ module Linguist
  !!name.match(/pdm\.lock/)
  end

+ # Internal: Is the blob a generated uv.lock?
+ #
+ # Returns true or false.
+ def uv_lock?
+ !!name.match(/uv\.lock/)
+ end
+
  # Internal: Is the blob a generated esy lock file?
  #
  # Returns true or false.
@@ -429,6 +438,13 @@ module Linguist
  !!name.match(/(^|\/)(\w+\.)?esy.lock$/)
  end

+ # Internal: Is the blob a generated deno lockfile, which are not meant for humans in pull requests.
+ #
+ # Returns true or false.
+ def deno_lock?
+ !!name.match(/deno\.lock/)
+ end
+
  # Internal: Is the blob a generated npm shrinkwrap or package lock file?
  #
  # Returns true or false.
@@ -697,14 +713,11 @@ module Linguist

  # Internal: Is this a generated Game Maker Studio (2) metadata file?
  #
- # All Game Maker Studio 2 generated files will be JSON, .yy or .yyp, and have
- # a part that looks like "modelName: GMname" on the 3rd line
- #
  # Return true or false
  def generated_gamemakerstudio?
  return false unless ['.yy', '.yyp'].include? extname
  return false unless lines.count > 3
- return lines[2].match(/\"modelName\"\:\s*\"GM/) ||
+ return lines.first(3).join('').match?(/^\s*[\{\[]/) ||
  lines[0] =~ /^\d\.\d\.\d.+\|\{/
  end

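Note: with the two new predicates above, deno.lock and uv.lock are now flagged as generated, so they are suppressed in diffs and excluded from language statistics like other lockfiles. A quick check, assuming the Generated.generated?(name, data) entry point from earlier releases is unchanged by this diff (the file contents passed below are placeholders):

  require 'linguist'

  Linguist::Generated.generated?("deno.lock", "{\"version\": \"3\"}")  # => true (new in 8.0.0)
  Linguist::Generated.generated?("uv.lock", "version = 1")             # => true (new in 8.0.0)
  Linguist::Generated.generated?("Cargo.lock", "")                     # => true (unchanged)
  Linguist::Generated.generated?("app.rb", "puts 'hi'")                # => false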
data/lib/linguist/generic.yml CHANGED
@@ -16,6 +16,7 @@ extensions:
  - ".9"
  - ".app"
  - ".cmp"
+ - ".resource"
  - ".sol"
  - ".stl"
  - ".tag"
data/lib/linguist/heuristics.rb CHANGED
@@ -126,7 +126,7 @@ module Linguist
  # Internal: Perform the heuristic
  def call(data)
  matched = @rules.find do |rule|
- rule['pattern'].match(data)
+ rule['pattern'].match?(data)
  end
  if !matched.nil?
  languages = matched['language']
@@ -145,14 +145,14 @@ module Linguist
  @pats = pats
  end

- def match(input)
- return !@pats.any? { |pat| !pat.match(input) }
+ def match?(input)
+ return @pats.all? { |pat| pat.match?(input) }
  end

  end

  class AlwaysMatch
- def match(input)
+ def match?(input)
  return true
  end
  end
@@ -163,8 +163,8 @@ module Linguist
  @pat = pat
  end

- def match(input)
- return !@pat.match(input)
+ def match?(input)
+ return !@pat.match?(input)
  end

  end
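
Note: the heuristics.rb changes rename the rule objects' match to match?, mirroring Ruby's Regexp#match?, which returns a plain boolean without allocating a MatchData object; the And rule is also rewritten from a double negation into the equivalent @pats.all? form. A minimal illustration of the Regexp behaviour this relies on (standard library only, example strings are illustrative):

  pattern = /\bmodule\s+Linguist\b/

  pattern.match("module Linguist")   # => #<MatchData "module Linguist">  (allocates a MatchData)
  pattern.match?("module Linguist")  # => true  (boolean only, no allocation)
  pattern.match?("class Linguist")   # => false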