github-linguist 7.30.0 → 8.0.0

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (82)
  1. checksums.yaml +4 -4
  2. data/grammars/inline.edgeql.json +1 -1
  3. data/grammars/inline.peggy.json +1 -0
  4. data/grammars/markdown.move.codeblock.json +1 -1
  5. data/grammars/mdx.move.codeblock.json +1 -0
  6. data/grammars/source.abl.json +1 -1
  7. data/grammars/source.bicep.json +1 -1
  8. data/grammars/source.cairo.json +1 -1
  9. data/grammars/source.cairo0.json +1 -0
  10. data/grammars/source.cl.json +1 -1
  11. data/grammars/source.clar.json +1 -1
  12. data/grammars/source.clarion.json +1 -1
  13. data/grammars/source.cmd.json +1 -1
  14. data/grammars/source.commonlisp.json +1 -1
  15. data/grammars/source.cs.json +1 -1
  16. data/grammars/source.curlrc.json +1 -1
  17. data/grammars/source.curry.json +1 -1
  18. data/grammars/source.cylc.json +1 -0
  19. data/grammars/source.dart.json +1 -1
  20. data/grammars/source.dds.dspf.json +1 -1
  21. data/grammars/source.dds.icff.json +1 -1
  22. data/grammars/source.dds.lf.json +1 -1
  23. data/grammars/source.dds.pf.json +1 -1
  24. data/grammars/source.dds.prtf.json +1 -1
  25. data/grammars/source.dune.json +1 -0
  26. data/grammars/source.elvish.json +1 -1
  27. data/grammars/source.fsharp.json +1 -1
  28. data/grammars/source.gdscript.json +1 -1
  29. data/grammars/source.gitconfig.json +1 -1
  30. data/grammars/source.gleam.json +1 -1
  31. data/grammars/source.hgignore.json +1 -1
  32. data/grammars/source.hosts.json +1 -1
  33. data/grammars/source.iCalendar.json +1 -0
  34. data/grammars/source.ice.json +1 -1
  35. data/grammars/source.julia.json +1 -1
  36. data/grammars/source.just.json +1 -1
  37. data/grammars/source.lcb.json +1 -0
  38. data/grammars/source.lilypond.json +1 -1
  39. data/grammars/source.livecodescript.json +1 -0
  40. data/grammars/source.luau.json +1 -1
  41. data/grammars/source.matlab.json +1 -1
  42. data/grammars/source.mcfunction.json +1 -1
  43. data/grammars/source.mdx.json +1 -1
  44. data/grammars/source.mo.json +1 -1
  45. data/grammars/source.move.json +1 -1
  46. data/grammars/source.nanorc.json +1 -1
  47. data/grammars/source.nr.json +1 -0
  48. data/grammars/source.p4.json +1 -1
  49. data/grammars/source.peggy.json +1 -0
  50. data/grammars/source.polar.json +1 -1
  51. data/grammars/source.powerbuilder.json +1 -0
  52. data/grammars/source.qsharp.json +1 -1
  53. data/grammars/source.rpgle.json +1 -1
  54. data/grammars/source.rust.json +1 -1
  55. data/grammars/source.sentinel.json +1 -1
  56. data/grammars/source.sourcepawn.json +1 -1
  57. data/grammars/source.stan.json +1 -1
  58. data/grammars/source.swift.json +1 -1
  59. data/grammars/source.sy.json +1 -1
  60. data/grammars/source.vba.json +1 -1
  61. data/grammars/source.vcard.json +1 -0
  62. data/grammars/source.wdl.json +1 -1
  63. data/grammars/text.adblock.json +1 -1
  64. data/grammars/text.html.jte.json +1 -0
  65. data/grammars/text.html.statamic.json +1 -1
  66. data/grammars/text.md.json +1 -1
  67. data/grammars/text.mdx.astro.codeblock.json +1 -0
  68. data/grammars/version +1 -1
  69. data/lib/linguist/VERSION +1 -1
  70. data/lib/linguist/classifier.rb +315 -106
  71. data/lib/linguist/generated.rb +17 -4
  72. data/lib/linguist/generic.yml +1 -0
  73. data/lib/linguist/heuristics.rb +6 -6
  74. data/lib/linguist/heuristics.yml +54 -4
  75. data/lib/linguist/languages.json +1 -1
  76. data/lib/linguist/languages.yml +123 -7
  77. data/lib/linguist/samples.json +1 -1
  78. data/lib/linguist/samples.rb +9 -1
  79. data/lib/linguist/sha256.rb +1 -1
  80. metadata +17 -5
  81. data/grammars/markdown.mcfunction.codeblock.json +0 -1
  82. data/grammars/mdx.LANGUAGE.codeblock.json +0 -1
data/grammars/text.mdx.astro.codeblock.json CHANGED
@@ -0,0 +1 @@
+ {"scopeName":"text.mdx.astro.codeblock","patterns":[{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(`{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r`])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}},{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(~{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}}],"repository":{"astro-code-block":{"patterns":[{"contentName":"meta.embedded.block.astro.frontmatter","begin":"^\\s*---\\s*$","end":"^\\s*---\\s*$","patterns":[{"include":"source.tsx"}],"beginCaptures":{"0":{"name":"punctuation.definition.tag.xi.begin.t"}},"endCaptures":{"0":{"name":"punctuation.definition.tag.xi.end.t"}}},{"include":"source.astro"}]}}}
data/grammars/version CHANGED
@@ -1 +1 @@
- 7.30.0
+ 8.0.0
data/lib/linguist/VERSION CHANGED
@@ -1 +1 @@
- 7.30.0
+ 8.0.0
data/lib/linguist/classifier.rb CHANGED
@@ -1,8 +1,12 @@
  require 'linguist/tokenizer'
+ require 'set'

  module Linguist
- # Language bayesian classifier.
+ # Language content classifier.
  class Classifier
+ # Maximum number of bytes to consider for classification.
+ # This is only used at evaluation time. During training, full content of
+ # samples is used.
  CLASSIFIER_CONSIDER_BYTES = 50 * 1024

  # Public: Use the classifier to detect language of the blob.
@@ -28,41 +32,59 @@ module Linguist
  #
  # db - Hash classifier database object
  # language - String language of data
- # data - String contents of file
+ # data - String contents of file or array of tokens.
  #
  # Examples
  #
- # Classifier.train(db, 'Ruby', "def hello; end")
+ # Classifier.train!(db, 'Ruby', "def hello; end")
  #
- # Returns nothing.
+ # Returns nil.
  #
- # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
- # per-language. See also #dump_all_tokens, below.
+ # Set LINGUIST_DEBUG=1, =2 or =3 to print internal statistics.
  def self.train!(db, language, data)
  tokens = data
  tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

- counts = Hash.new(0)
- tokens.each { |tok| counts[tok] += 1 }
+ db['vocabulary'] ||= {}
+ # Set hash to autoincremented index value
+ if db['vocabulary'].default_proc.nil?
+ db['vocabulary'].default_proc = proc do |hash, key|
+ hash[key] = hash.length
+ end
+ end

- db['tokens_total'] ||= 0
- db['languages_total'] ||= 0
- db['tokens'] ||= {}
- db['language_tokens'] ||= {}
- db['languages'] ||= {}
+ db['samples'] ||= {}
+ db['samples'][language] ||= []

- counts.each do |token, count|
- db['tokens'][language] ||= {}
- db['tokens'][language][token] ||= 0
- db['tokens'][language][token] += count
- db['language_tokens'][language] ||= 0
- db['language_tokens'][language] += count
- db['tokens_total'] += count
- end
- db['languages'][language] ||= 0
- db['languages'][language] += 1
- db['languages_total'] += 1
+ termfreq = to_vocabulary_index_termfreq(db['vocabulary'], tokens)
+ db['samples'][language] << termfreq
+
+ nil
+ end
+
+ # Public: Finalize training.
+ #
+ # db - Hash classifier database object
+ #
+ # Examples:
+ # Classifier.finalize_train!(db)
+ #
+ # Returns nil.
+ #
+ # This method must be called after the last #train! call.
+ def self.finalize_train!(db)
+ db['vocabulary'] ||= {}
+
+ # Unset hash autoincrement
+ db['vocabulary'].default_proc = nil

+ db['samples'] ||= []
+ filter_vocab_by_freq! db, MIN_DOCUMENT_FREQUENCY
+ sort_vocab! db
+ db['icf'] = inverse_class_freqs db
+ normalize_samples! db
+ db['centroids'] = get_centroids db
+ db.delete 'samples'
  nil
  end

@@ -78,20 +100,17 @@ module Linguist
  # # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
  #
  # Returns sorted Array of result pairs. Each pair contains the
- # String language name and a Float score.
+ # String language name and a Float score between 0.0 and 1.0.
  def self.classify(db, tokens, languages = nil)
- languages ||= db['languages'].keys
+ languages ||= db['centroids'].keys
  new(db).classify(tokens, languages)
  end

  # Internal: Initialize a Classifier.
  def initialize(db = {})
- @tokens_total = db['tokens_total']
- @languages_total = db['languages_total']
- @tokens = db['tokens']
- @language_tokens = db['language_tokens']
- @languages = db['languages']
- @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
+ @vocabulary = db['vocabulary']
+ @centroids = db['centroids']
+ @icf = db['icf']
  end

  # Internal: Guess language of data
@@ -100,72 +119,70 @@ module Linguist
  # languages - Array of language name Strings to restrict to.
  #
  # Returns sorted Array of result pairs. Each pair contains the
- # String language name and a Float score.
+ # String language name and a Float score between 0.0 and 1.0.
  def classify(tokens, languages)
  return [] if tokens.nil? || languages.empty?
  tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
- scores = {}

- debug_dump_all_tokens(tokens, languages) if verbosity >= 2
+ debug_dump_tokens(tokens) if verbosity >= 3

- counts = Hash.new(0)
- tokens.each { |tok| counts[tok] += 1 }
+ vec = Classifier.to_vocabulary_index_termfreq_gaps(@vocabulary, tokens)
+ vec.each do |idx, freq|
+ tf = 1.0 + Math.log(freq)
+ vec[idx] = tf * @icf[idx]
+ end
+ return [] if vec.empty?
+ Classifier.l2_normalize!(vec)

+ scores = {}
  languages.each do |language|
- scores[language] = tokens_probability(counts, language) + language_probability(language)
- debug_dump_probabilities(counts, language, scores[language]) if verbosity >= 1
+ centroid = @centroids[language]
+ score = Classifier.similarity(vec, centroid)
+ if score > 0.0
+ scores[language] = score
+ end
  end
-
- scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
+ scores = scores.sort_by { |x| -x[1] }
+ debug_dump_all_tokens(tokens, scores) if verbosity >= 2
+ debug_dump_scores(scores) if verbosity >= 1
+ scores
  end

- # Internal: Probably of set of tokens in a language occurring - P(D | C)
- #
- # tokens - Array of String tokens.
- # language - Language to check.
- #
- # Returns Float between 0.0 and 1.0.
- def tokens_probability(counts, language)
- sum = 0
- counts.each do |token, count|
- sum += count * token_probability(token, language)
+ private
+ MIN_DOCUMENT_FREQUENCY = 2
+
+ def verbosity
+ @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
  end
- sum
- end

- # Internal: Log-probability of token in language occurring - P(F | C)
- #
- # token - String token.
- # language - Language to check.
- #
- # Returns Float.
- def token_probability(token, language)
- count = @tokens[language][token]
- if count.nil? || count == 0
- # This is usually the most common case, so we cache the result.
- @unknown_logprob
- else
- Math.log(count.to_f / @language_tokens[language].to_f)
+ def debug_dump_scores(scores)
+ headers = ["Language", "Score"]
+ rows = scores.map { |l, s| [l, "%.3f" % s] }
+ dump_table(headers, rows)
  end
- end

- # Internal: Probably of a language occurring - P(C)
- #
- # language - Language to check.
- #
- # Returns Float between 0.0 and 1.0.
- def language_probability(language)
- Math.log(@languages[language].to_f / @languages_total.to_f)
- end
+ def debug_dump_tokens(tokens)
+ counts = Hash.new(0)
+ tokens.each do |tok|
+ idx = @vocabulary[tok]
+ if not idx.nil?
+ counts[tok] += 1
+ end
+ end

- private
- def verbosity
- @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
- end
+ norm = Classifier.l2_norm(counts)
+ rows = counts.map do |tok, tf|
+ idx = @vocabulary[tok]
+ log_tf = 1.0 + Math.log(tf)
+ with_icf = log_tf * @icf[idx]
+ normalized = with_icf / norm
+ row = [tok, tf, "%.3f" % log_tf, "%.3f" % with_icf, "%.3f" % normalized]
+ [normalized, row]
+ end

- def debug_dump_probabilities(tokens, language, score)
- printf("%10s = %10.3f + %7.3f = %10.3f\n",
- language, tokens_probability(tokens, language), language_probability(language), score)
+ headers = ["Token", "TF", "log", "*ICF", "L2"]
+ rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+ dump_table(headers, rows)
  end

  # Internal: show a table of probabilities for each <token,language> pair.
@@ -173,31 +190,223 @@ module Linguist
  # The number in each table entry is the number of "points" that each
  # token contributes toward the belief that the file under test is a
  # particular language. Points are additive.
- #
- # Points are the number of times a token appears in the file, times
- # how much more likely (log of probability ratio) that token is to
- # appear in one language vs. the least-likely language. Dashes
- # indicate the least-likely language (and zero points) for each token.
- def debug_dump_all_tokens(tokens, languages)
- maxlen = tokens.map { |tok| tok.size }.max
-
- printf "%#{maxlen}s", ""
- puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join
-
- token_map = Hash.new(0)
- tokens.each { |tok| token_map[tok] += 1 }
-
- token_map.sort.each { |tok, count|
- arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
- min = arr.map { |a,b| b }.min
- if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
- printf "%#{maxlen}s%5d", tok, count
-
- puts arr.map { |ent|
- ent[1] == min ? " -" : sprintf("%10.3f", count * (ent[1] - min))
- }.join
+ def debug_dump_all_tokens(tokens, scores)
+ languages = scores.map { |l, _| l }
+
+ counts = Hash.new(0)
+ tokens.each do |tok|
+ idx = @vocabulary[tok]
+ if not idx.nil?
+ counts[tok] += 1
+ end
+ end
+
+ data = {}
+ norm = Classifier.l2_norm(counts)
+ languages.each do |language|
+ data[language] = {}
+ counts.each do |tok, tf|
+ idx = @vocabulary[tok]
+ log_tf = 1.0 + Math.log(tf)
+ with_icf = log_tf * @icf[idx]
+ normalized = with_icf / norm
+ data[language][tok] = normalized * @centroids[language][idx].to_f
+ end
+ end
+
+ norm = Classifier.l2_norm(counts)
+ rows = counts.map do |tok, tf|
+ idx = @vocabulary[tok]
+ log_tf = 1.0 + Math.log(tf)
+ with_icf = log_tf * @icf[idx]
+ normalized = with_icf / norm
+ scores = languages.map do |l, _|
+ [l, data[l][tok].to_f]
+ end
+ max_score = scores.to_h.values.max
+ row = [tok] + scores.map do |l, s|
+ if s == max_score
+ "%.4f*" % s
+ elsif s > 0.0
+ "%.4f" % s
+ else
+ "-"
+ end
+ end
+ [normalized, row]
+ end
+ headers = ["Token"] + (0..languages.length-1).map { |lidx| "[#{lidx}]" }
+ rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
+ legend = languages.each_with_index.map { |l, lidx| "[#{lidx}] = #{l}" }
+ dump_table(headers, rows, legend)
+ end
+
+ def dump_table(header, rows, legend = nil)
+ n_cols = header.length
+ rows = rows.map { |r| r.map { |c| c.to_s } }
+ col_widths = (0..n_cols - 1).map do |j|
+ ([header[j].length] + rows.map { |row| row[j].length }).max
+ end
+ sep_line = "| #{(0..n_cols-1).map { |j| "-" * col_widths[j] }.join(" | ")} |"
+ content_width = sep_line.length - 4
+ top_line = "| #{"-" * content_width} |"
+
+ format_row = Proc.new do |row|
+ cells = row.zip(col_widths).map do |cell, width|
+ "%-#{width}s" % cell
+ end
+ "| %s |" % cells.join(" | ")
+ end
+
+ puts top_line
+ puts format_row.call(header)
+ puts sep_line
+ rows.each do |row|
+ puts format_row.call(row)
+ end
+ puts top_line
+ if legend
+ legend.each do |line|
+ puts "| %-#{content_width}s |" % line
+ end
+ puts top_line
+ end
+ end
+
+ def self.to_vocabulary_index_termfreq(vocab, tokens)
+ counts = Hash.new(0)
+ tokens.each do |key|
+ idx = vocab[key]
+ counts[idx] += 1
+ end
+ counts
+ end
+
+ def self.to_vocabulary_index_termfreq_gaps(vocab, tokens)
+ counts = Hash.new(0)
+ tokens.each do |key|
+ if vocab.key? key
+ idx = vocab[key]
+ counts[idx] += 1
+ end
+ end
+ counts
+ end
+
+ def self.l2_norm(vec)
+ norm = vec.values.inject(0.0) { |sum, x| sum + x**2 }
+ Math.sqrt(norm)
+ end
+
+ def self.l2_normalize!(vec)
+ norm = l2_norm(vec)
+ vec.transform_values! do |value|
+ value.to_f / norm
+ end
+ nil
+ end
+
+ def self.similarity(a, b)
+ sum = 0.0
+ a.each_key do |idx|
+ if b.key? idx
+ sum += a[idx] * b[idx]
  end
- }
+ end
+ sum
  end
+
+ # Filter vocabulary by minimum document frequency.
+ def self.filter_vocab_by_freq!(db, min_freq)
+ vocabulary = db['vocabulary']
+
+ # Get document frequencies
+ docfreq = Array.new(vocabulary.size, 0)
+ db['samples'].each_value do |samples|
+ samples.each do |sample|
+ sample.each_key do |idx|
+ docfreq[idx] += 1
+ end
+ end
+ end
+
+ vocabulary.select! do |_, idx|
+ docfreq[idx] >= min_freq
+ end
+
+ nil
+ end
+
+ # Sort vocabulary lexicographically.
+ def self.sort_vocab!(db)
+ new_indices = Hash.new { |h,k| h[k] = h.length }
+ db['vocabulary'].sort_by { |x| x[0] }.each do |term, idx|
+ db['vocabulary'][term] = new_indices[idx]
+ end
+ new_indices.default_proc = nil
+
+ db['samples'].transform_values! do |samples|
+ samples.map do |sample|
+ new_sample = {}
+ sample.each do |idx, freq|
+ new_idx = new_indices[idx]
+ if not new_idx.nil?
+ new_sample[new_idx] = freq
+ end
+ end
+ new_sample
+ end
+ end
+ end
+
+ # Compute inverse class frequency (ICF) for every term.
+ def self.inverse_class_freqs(db)
+ icf = Array.new(db['vocabulary'].size, 0)
+ db['samples'].each_value do |samples|
+ terms = Set.new
+ samples.each do |sample|
+ terms |= sample.keys
+ end
+ terms.each do |idx|
+ icf[idx] += 1
+ end
+ end
+ icf.map! do |val|
+ Math.log(db['samples'].size.to_f / val.to_f) + 1
+ end
+ icf
+ end
+
+ def self.normalize_samples!(db)
+ icf = db['icf']
+ db['samples'].each_value do |samples|
+ samples.each do |sample|
+ sample.each do |idx, freq|
+ tf = 1.0 + Math.log(freq)
+ sample[idx] = tf * icf[idx]
+ end
+ l2_normalize! sample
+ end
+ end
+ end
+
+ def self.get_centroids(db)
+ centroids = {}
+ db['samples'].each do |language, samples|
+ centroid = Hash.new(0.0)
+ samples.each do |sample|
+ sample.each do |idx, val|
+ centroid[idx] += val
+ end
+ end
+ centroid.each_key do |idx|
+ centroid[idx] = centroid[idx] / samples.length
+ end
+ l2_normalize! centroid
+ centroids[language] = centroid
+ end
+ centroids
+ end
+
  end
  end
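Note: the classifier rewrite above replaces the 7.30.0 naive Bayes scoring with per-language TF-ICF centroids compared by cosine similarity, and splits training into train! plus a new, mandatory finalize_train! step. A minimal usage sketch based on the docstrings in this diff (the sample strings are illustrative, not taken from the package):

    require 'linguist/classifier'

    db = {}

    # Accumulate raw term frequencies per language. `data` may be a String
    # (tokenized internally) or an Array of tokens.
    Linguist::Classifier.train!(db, 'Ruby',   "def hello; end")
    Linguist::Classifier.train!(db, 'Python', "def hello(): pass")

    # New in 8.0.0: must be called once after the last train! call. It prunes
    # low-frequency vocabulary, computes ICF weights and per-language
    # centroids, then drops the raw samples from the db.
    Linguist::Classifier.finalize_train!(db)

    # Returns [[language, score], ...] sorted by descending cosine similarity,
    # with each score between 0.0 and 1.0.
    Linguist::Classifier.classify(db, "def hello; end")

Because both the query vector and the stored centroids are L2-normalized, the returned scores are plain cosine similarities and are comparable across inputs, unlike the unbounded log-probabilities of the old classifier.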
data/lib/linguist/generated.rb CHANGED
@@ -61,6 +61,7 @@ module Linguist
  composer_lock? ||
  cargo_lock? ||
  cargo_orig? ||
+ deno_lock? ||
  flake_lock? ||
  bazel_lock? ||
  node_modules? ||
@@ -68,6 +69,7 @@ module Linguist
  go_lock? ||
  poetry_lock? ||
  pdm_lock? ||
+ uv_lock? ||
  esy_lock? ||
  npm_shrinkwrap_or_package_lock? ||
  pnpm_lock? ||
@@ -422,6 +424,13 @@ module Linguist
  !!name.match(/pdm\.lock/)
  end

+ # Internal: Is the blob a generated uv.lock?
+ #
+ # Returns true or false.
+ def uv_lock?
+ !!name.match(/uv\.lock/)
+ end
+
  # Internal: Is the blob a generated esy lock file?
  #
  # Returns true or false.
@@ -429,6 +438,13 @@ module Linguist
  !!name.match(/(^|\/)(\w+\.)?esy.lock$/)
  end

+ # Internal: Is the blob a generated Deno lockfile, which is not meant for humans in pull requests.
+ #
+ # Returns true or false.
+ def deno_lock?
+ !!name.match(/deno\.lock/)
+ end
+
  # Internal: Is the blob a generated npm shrinkwrap or package lock file?
  #
  # Returns true or false.
@@ -697,14 +713,11 @@ module Linguist

  # Internal: Is this a generated Game Maker Studio (2) metadata file?
  #
- # All Game Maker Studio 2 generated files will be JSON, .yy or .yyp, and have
- # a part that looks like "modelName: GMname" on the 3rd line
- #
  # Return true or false
  def generated_gamemakerstudio?
  return false unless ['.yy', '.yyp'].include? extname
  return false unless lines.count > 3
- return lines[2].match(/\"modelName\"\:\s*\"GM/) ||
+ return lines.first(3).join('').match?(/^\s*[\{\[]/) ||
  lines[0] =~ /^\d\.\d\.\d.+\|\{/
  end

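Note: both new lockfile checks follow the existing pdm_lock? style: an unanchored match against the blob name, so any path containing the lockfile name is treated as generated. A quick illustration of the two regexes added above (the paths are made up):

    !!"uv.lock".match(/uv\.lock/)            # => true
    !!"tools/deno.lock".match(/deno\.lock/)  # => true
    !!"deno.lockfile".match(/deno\.lock/)    # => true (unanchored, so substrings match too)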
data/lib/linguist/generic.yml CHANGED
@@ -16,6 +16,7 @@ extensions:
  - ".9"
  - ".app"
  - ".cmp"
+ - ".resource"
  - ".sol"
  - ".stl"
  - ".tag"
data/lib/linguist/heuristics.rb CHANGED
@@ -126,7 +126,7 @@ module Linguist
  # Internal: Perform the heuristic
  def call(data)
  matched = @rules.find do |rule|
- rule['pattern'].match(data)
+ rule['pattern'].match?(data)
  end
  if !matched.nil?
  languages = matched['language']
@@ -145,14 +145,14 @@ module Linguist
  @pats = pats
  end

- def match(input)
- return !@pats.any? { |pat| !pat.match(input) }
+ def match?(input)
+ return @pats.all? { |pat| pat.match?(input) }
  end

  end

  class AlwaysMatch
- def match(input)
+ def match?(input)
  return true
  end
  end
@@ -163,8 +163,8 @@ module Linguist
  @pat = pat
  end

- def match(input)
- return !@pat.match(input)
+ def match?(input)
+ return !@pat.match?(input)
  end

  end
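Note: the heuristics change swaps Regexp#match for Regexp#match? throughout the rule classes; match? returns a plain boolean without allocating a MatchData or setting $~, and the And rule is simplified from a double-negated any? into an equivalent all?. A small illustration of the difference (the pattern and input are made up):

    pat = /\bmodule\b/

    pat.match?("module Linguist")  # => true  (boolean only, no MatchData, no $~)
    pat.match("module Linguist")   # => #<MatchData "module">  (allocates a MatchData)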