github-linguist 7.29.0 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/git-linguist +7 -3
- data/bin/github-linguist +6 -3
- data/grammars/etc.json +1 -1
- data/grammars/inline.edgeql.json +1 -1
- data/grammars/inline.graphql.json +1 -1
- data/grammars/inline.peggy.json +1 -0
- data/grammars/markdown.gleam.codeblock.json +1 -0
- data/grammars/markdown.move.codeblock.json +1 -1
- data/grammars/markdown.rego.codeblock.json +1 -0
- data/grammars/mdx.move.codeblock.json +1 -0
- data/grammars/source.8xp.json +1 -1
- data/grammars/source.Caddyfile-test.json +1 -0
- data/grammars/source.Caddyfile.json +1 -0
- data/grammars/source.abl.json +1 -1
- data/grammars/source.astro.json +1 -1
- data/grammars/source.bicep.json +1 -1
- data/grammars/source.bms.json +1 -1
- data/grammars/source.bqn.json +1 -0
- data/grammars/source.brs.json +1 -1
- data/grammars/source.cairo.json +1 -1
- data/grammars/source.cairo0.json +1 -0
- data/grammars/source.cl.json +1 -1
- data/grammars/source.clar.json +1 -1
- data/grammars/source.cmd.json +1 -1
- data/grammars/source.cobol.json +1 -1
- data/grammars/source.commonlisp.json +1 -1
- data/grammars/source.cs.json +1 -1
- data/grammars/source.curlrc.json +1 -1
- data/grammars/source.curry.json +1 -1
- data/grammars/source.cylc.json +1 -0
- data/grammars/source.d2.json +1 -1
- data/grammars/source.dart.json +1 -1
- data/grammars/source.dds.dspf.json +1 -1
- data/grammars/source.dds.icff.json +1 -1
- data/grammars/source.dds.lf.json +1 -1
- data/grammars/source.dds.pf.json +1 -1
- data/grammars/source.dds.prtf.json +1 -1
- data/grammars/source.dune.json +1 -0
- data/grammars/source.elvish.json +1 -1
- data/grammars/source.firrtl.json +1 -0
- data/grammars/source.fsharp.json +1 -1
- data/grammars/source.gdscript.json +1 -1
- data/grammars/source.generic-db.json +1 -1
- data/grammars/source.gitconfig.json +1 -1
- data/grammars/source.gjs.json +1 -1
- data/grammars/source.gleam.json +1 -1
- data/grammars/source.gts.json +1 -1
- data/grammars/source.hcl.json +1 -1
- data/grammars/source.hcl.terraform.json +1 -1
- data/grammars/source.hgignore.json +1 -1
- data/grammars/source.hosts.json +1 -1
- data/grammars/source.hx.json +1 -1
- data/grammars/source.iCalendar.json +1 -0
- data/grammars/source.ice.json +1 -1
- data/grammars/source.julia.json +1 -1
- data/grammars/source.just.json +1 -1
- data/grammars/source.kotlin.json +1 -1
- data/grammars/source.lcb.json +1 -0
- data/grammars/source.lilypond.json +1 -1
- data/grammars/source.livecodescript.json +1 -0
- data/grammars/source.lua.json +1 -1
- data/grammars/source.luau.json +1 -0
- data/grammars/source.m2.json +1 -1
- data/grammars/source.markdown.caddy.codeblock.json +1 -0
- data/grammars/source.matlab.json +1 -1
- data/grammars/source.mcfunction.json +1 -1
- data/grammars/source.mdx.json +1 -1
- data/grammars/source.mo.json +1 -1
- data/grammars/source.mojo.json +1 -1
- data/grammars/source.move.json +1 -1
- data/grammars/source.nanorc.json +1 -1
- data/grammars/source.nim.json +1 -1
- data/grammars/source.nr.json +1 -0
- data/grammars/source.nushell.json +1 -1
- data/grammars/source.odin.json +1 -1
- data/grammars/source.p4.json +1 -1
- data/grammars/source.peggy.json +1 -0
- data/grammars/source.pkl.json +1 -0
- data/grammars/source.polar.json +1 -1
- data/grammars/source.powerbuilder.json +1 -0
- data/grammars/source.qsharp.json +1 -1
- data/grammars/source.rascal.json +1 -1
- data/grammars/source.rego.json +1 -1
- data/grammars/source.rescript.json +1 -1
- data/grammars/source.ron.json +1 -0
- data/grammars/source.rpgle.json +1 -1
- data/grammars/source.rust.json +1 -1
- data/grammars/source.sentinel.json +1 -1
- data/grammars/source.solidity.json +1 -1
- data/grammars/source.sourcepawn.json +1 -1
- data/grammars/source.sqf.json +1 -1
- data/grammars/source.stan.json +1 -1
- data/grammars/source.swift.json +1 -1
- data/grammars/source.sy.json +1 -1
- data/grammars/source.templ.json +1 -0
- data/grammars/source.vba.json +1 -1
- data/grammars/source.vcard.json +1 -0
- data/grammars/source.wdl.json +1 -1
- data/grammars/source.wsd.json +1 -1
- data/grammars/text.adblock.json +1 -1
- data/grammars/text.crontab.json +1 -0
- data/grammars/text.html.jte.json +1 -0
- data/grammars/text.html.statamic.json +1 -1
- data/grammars/text.md.json +1 -1
- data/grammars/text.mdx.astro.codeblock.json +1 -0
- data/grammars/text.valve-cfg.json +1 -1
- data/grammars/version +1 -1
- data/lib/linguist/VERSION +1 -1
- data/lib/linguist/classifier.rb +315 -106
- data/lib/linguist/generated.rb +33 -4
- data/lib/linguist/generic.yml +1 -0
- data/lib/linguist/heuristics.rb +6 -6
- data/lib/linguist/heuristics.yml +63 -4
- data/lib/linguist/languages.json +1 -1
- data/lib/linguist/languages.yml +224 -4
- data/lib/linguist/repository.rb +8 -6
- data/lib/linguist/samples.json +1 -1
- data/lib/linguist/samples.rb +9 -1
- data/lib/linguist/sha256.rb +1 -1
- metadata +29 -7
- data/grammars/inline.graphql.rb.json +0 -1
- data/grammars/markdown.mcfunction.codeblock.json +0 -1
- data/grammars/mdx.LANGUAGE.codeblock.json +0 -1
- data/grammars/source.terraform.json +0 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"scopeName":"text.mdx.astro.codeblock","patterns":[{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(`{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r`])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}},{"name":"markup.code.astro.mdx","contentName":"meta.embedded.astro","begin":"(?:^|\\G)[\\t ]*(~{3,})(?:[\\t ]*((?i:(?:.*\\.)?astro))(?:[\\t ]+((?:[^\\n\\r])+))?)(?:[\\t ]*$)","end":"(\\1)(?:[\\t ]*$)","patterns":[{"include":"#astro-code-block"}],"beginCaptures":{"1":{"name":"string.other.begin.code.fenced.mdx"},"2":{"name":"entity.name.function.mdx"}},"endCaptures":{"1":{"name":"string.other.end.code.fenced.mdx"}}}],"repository":{"astro-code-block":{"patterns":[{"contentName":"meta.embedded.block.astro.frontmatter","begin":"^\\s*---\\s*$","end":"^\\s*---\\s*$","patterns":[{"include":"source.tsx"}],"beginCaptures":{"0":{"name":"punctuation.definition.tag.xi.begin.t"}},"endCaptures":{"0":{"name":"punctuation.definition.tag.xi.end.t"}}},{"include":"source.astro"}]}}}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"scopeName":"text.valve-cfg","patterns":[{"include":"#comments"},{"include":"#function"},{"include":"#cvar"},{"include":"#value"}],"repository":{"comments":{"patterns":[{"name":"comment.valve-cfg","match":"\\/\\/.*"},{"name":"comment.block.valve-cfg","begin":"/\\*","end":"\\*/","captures":{"0":{"name":"comment.valve-cfg"}}}]},"cvar":{"patterns":[{"match":"^\\s*(\"[A-Za-z0-9_-]*\")","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s*('[A-Za-z0-9_-]*')","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s*([A-Za-z0-9_-]*)","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}}]},"function":{"patterns":[{"name":"support.function.valve-cfg","match":"^\\s*\\b(alias|bind_osx|bind|clear|echo|execifexists|execwithwhitelist|exec|host_writeconfig_ss|host_writeconfig|key_updatelayout|playvol|say_team|say|unbindalljoystick|unbindallmousekeyboard|unbindall)","captures":{"1":{"name":"support.function.valve-cfg"}}}]},"numeric-literal":{"patterns":[{"name":"constant.numeric.float.
|
|
1
|
+
{"scopeName":"text.valve-cfg","patterns":[{"include":"#comments"},{"include":"#function"},{"include":"#cvar"},{"include":"#value"}],"repository":{"comments":{"patterns":[{"name":"comment.valve-cfg","match":"\\/\\/.*"},{"name":"comment.block.valve-cfg","begin":"/\\*","end":"\\*/","captures":{"0":{"name":"comment.valve-cfg"}}}]},"cvar":{"patterns":[{"match":"^\\s*(\"[A-Za-z0-9_-]*\")","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s*('[A-Za-z0-9_-]*')","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}},{"match":"^\\s*([A-Za-z0-9_-]*)","captures":{"1":{"name":"support.type.property-name.valve-cfg"}}}]},"function":{"patterns":[{"name":"support.function.valve-cfg","match":"^\\s*\\b(alias|bind_osx|bind|clear|echo|execifexists|execwithwhitelist|exec|host_writeconfig_ss|host_writeconfig|key_updatelayout|playvol|say_team|say|unbindalljoystick|unbindallmousekeyboard|unbindall)","captures":{"1":{"name":"support.function.valve-cfg"}}}]},"numeric-literal":{"patterns":[{"name":"constant.numeric.float.valve-cfg","match":"[0-9]+\\.[0-9]+"},{"name":"constant.numeric.valve-cfg","match":"\\b0b[0-1]+\\b"},{"name":"constant.numeric.valve-cfg","match":"\\b0o[0-7]+\\b"},{"name":"constant.numeric.valve-cfg","match":"\\b0x[0-9a-fA-F]+\\b"},{"name":"constant.numeric.integer.valve-cfg","match":"\\b\\d+\\b"},{"name":"invalid.illegal.constant.valve-cfg","match":"\\b\\d+\\w+\\b"}]},"strings":{"name":"string.quoted.double.valve-cfg","begin":"\"","end":"\"","patterns":[{"name":"variable","match":"\\\\."}]},"value":{"begin":".","end":"\\n","patterns":[{"include":"#numeric-literal"},{"include":"#strings"}]}}}
|
data/grammars/version
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
8.0.0
|
data/lib/linguist/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
8.0.0
|
data/lib/linguist/classifier.rb
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
require 'linguist/tokenizer'
|
|
2
|
+
require 'set'
|
|
2
3
|
|
|
3
4
|
module Linguist
|
|
4
|
-
# Language
|
|
5
|
+
# Language content classifier.
|
|
5
6
|
class Classifier
|
|
7
|
+
# Maximum number of bytes to consider for classification.
|
|
8
|
+
# This is only used at evaluation time. During training, full content of
|
|
9
|
+
# samples is used.
|
|
6
10
|
CLASSIFIER_CONSIDER_BYTES = 50 * 1024
|
|
7
11
|
|
|
8
12
|
# Public: Use the classifier to detect language of the blob.
|
|
@@ -28,41 +32,59 @@ module Linguist
|
|
|
28
32
|
#
|
|
29
33
|
# db - Hash classifier database object
|
|
30
34
|
# language - String language of data
|
|
31
|
-
# data - String contents of file
|
|
35
|
+
# data - String contents of file or array of tokens.
|
|
32
36
|
#
|
|
33
37
|
# Examples
|
|
34
38
|
#
|
|
35
|
-
# Classifier.train(db, 'Ruby', "def hello; end")
|
|
39
|
+
# Classifier.train!(db, 'Ruby', "def hello; end")
|
|
36
40
|
#
|
|
37
|
-
# Returns
|
|
41
|
+
# Returns nil.
|
|
38
42
|
#
|
|
39
|
-
# Set LINGUIST_DEBUG=1 or =
|
|
40
|
-
# per-language. See also #dump_all_tokens, below.
|
|
43
|
+
# Set LINGUIST_DEBUG=1, =2 or =3 to print internal statistics.
|
|
41
44
|
def self.train!(db, language, data)
|
|
42
45
|
tokens = data
|
|
43
46
|
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
|
44
47
|
|
|
45
|
-
|
|
46
|
-
|
|
48
|
+
db['vocabulary'] ||= {}
|
|
49
|
+
# Set hash to autoincremented index value
|
|
50
|
+
if db['vocabulary'].default_proc.nil?
|
|
51
|
+
db['vocabulary'].default_proc = proc do |hash, key|
|
|
52
|
+
hash[key] = hash.length
|
|
53
|
+
end
|
|
54
|
+
end
|
|
47
55
|
|
|
48
|
-
db['
|
|
49
|
-
db['
|
|
50
|
-
db['tokens'] ||= {}
|
|
51
|
-
db['language_tokens'] ||= {}
|
|
52
|
-
db['languages'] ||= {}
|
|
56
|
+
db['samples'] ||= {}
|
|
57
|
+
db['samples'][language] ||= []
|
|
53
58
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
59
|
+
termfreq = to_vocabulary_index_termfreq(db['vocabulary'], tokens)
|
|
60
|
+
db['samples'][language] << termfreq
|
|
61
|
+
|
|
62
|
+
nil
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Public: Finalize training.
|
|
66
|
+
#
|
|
67
|
+
# db - Hash classifier database object
|
|
68
|
+
#
|
|
69
|
+
# Examples:
|
|
70
|
+
# Classifier.finalize_train!(db)
|
|
71
|
+
#
|
|
72
|
+
# Returns nil.
|
|
73
|
+
#
|
|
74
|
+
# This method must be called after the last #train! call.
|
|
75
|
+
def self.finalize_train!(db)
|
|
76
|
+
db['vocabulary'] ||= {}
|
|
77
|
+
|
|
78
|
+
# Unset hash autoincrement
|
|
79
|
+
db['vocabulary'].default_proc = nil
|
|
65
80
|
|
|
81
|
+
db['samples'] ||= []
|
|
82
|
+
filter_vocab_by_freq! db, MIN_DOCUMENT_FREQUENCY
|
|
83
|
+
sort_vocab! db
|
|
84
|
+
db['icf'] = inverse_class_freqs db
|
|
85
|
+
normalize_samples! db
|
|
86
|
+
db['centroids'] = get_centroids db
|
|
87
|
+
db.delete 'samples'
|
|
66
88
|
nil
|
|
67
89
|
end
|
|
68
90
|
|
|
@@ -78,20 +100,17 @@ module Linguist
|
|
|
78
100
|
# # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
|
|
79
101
|
#
|
|
80
102
|
# Returns sorted Array of result pairs. Each pair contains the
|
|
81
|
-
# String language name and a Float score.
|
|
103
|
+
# String language name and a Float score between 0.0 and 1.0.
|
|
82
104
|
def self.classify(db, tokens, languages = nil)
|
|
83
|
-
languages ||= db['
|
|
105
|
+
languages ||= db['centroids'].keys
|
|
84
106
|
new(db).classify(tokens, languages)
|
|
85
107
|
end
|
|
86
108
|
|
|
87
109
|
# Internal: Initialize a Classifier.
|
|
88
110
|
def initialize(db = {})
|
|
89
|
-
@
|
|
90
|
-
@
|
|
91
|
-
@
|
|
92
|
-
@language_tokens = db['language_tokens']
|
|
93
|
-
@languages = db['languages']
|
|
94
|
-
@unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
|
|
111
|
+
@vocabulary = db['vocabulary']
|
|
112
|
+
@centroids = db['centroids']
|
|
113
|
+
@icf = db['icf']
|
|
95
114
|
end
|
|
96
115
|
|
|
97
116
|
# Internal: Guess language of data
|
|
@@ -100,72 +119,70 @@ module Linguist
|
|
|
100
119
|
# languages - Array of language name Strings to restrict to.
|
|
101
120
|
#
|
|
102
121
|
# Returns sorted Array of result pairs. Each pair contains the
|
|
103
|
-
# String language name and a Float score.
|
|
122
|
+
# String language name and a Float score between 0.0 and 1.0.
|
|
104
123
|
def classify(tokens, languages)
|
|
105
124
|
return [] if tokens.nil? || languages.empty?
|
|
106
125
|
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
|
107
|
-
scores = {}
|
|
108
126
|
|
|
109
|
-
|
|
127
|
+
debug_dump_tokens(tokens) if verbosity >= 3
|
|
110
128
|
|
|
111
|
-
|
|
112
|
-
|
|
129
|
+
vec = Classifier.to_vocabulary_index_termfreq_gaps(@vocabulary, tokens)
|
|
130
|
+
vec.each do |idx, freq|
|
|
131
|
+
tf = 1.0 + Math.log(freq)
|
|
132
|
+
vec[idx] = tf * @icf[idx]
|
|
133
|
+
end
|
|
134
|
+
return [] if vec.empty?
|
|
135
|
+
Classifier.l2_normalize!(vec)
|
|
113
136
|
|
|
137
|
+
scores = {}
|
|
114
138
|
languages.each do |language|
|
|
115
|
-
|
|
116
|
-
|
|
139
|
+
centroid = @centroids[language]
|
|
140
|
+
score = Classifier.similarity(vec, centroid)
|
|
141
|
+
if score > 0.0
|
|
142
|
+
scores[language] = score
|
|
143
|
+
end
|
|
117
144
|
end
|
|
118
|
-
|
|
119
|
-
|
|
145
|
+
scores = scores.sort_by { |x| -x[1] }
|
|
146
|
+
debug_dump_all_tokens(tokens, scores) if verbosity >= 2
|
|
147
|
+
debug_dump_scores(scores) if verbosity >= 1
|
|
148
|
+
scores
|
|
120
149
|
end
|
|
121
150
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
# Returns Float between 0.0 and 1.0.
|
|
128
|
-
def tokens_probability(counts, language)
|
|
129
|
-
sum = 0
|
|
130
|
-
counts.each do |token, count|
|
|
131
|
-
sum += count * token_probability(token, language)
|
|
151
|
+
private
|
|
152
|
+
MIN_DOCUMENT_FREQUENCY = 2
|
|
153
|
+
|
|
154
|
+
def verbosity
|
|
155
|
+
@verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
|
|
132
156
|
end
|
|
133
|
-
sum
|
|
134
|
-
end
|
|
135
157
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
#
|
|
141
|
-
# Returns Float.
|
|
142
|
-
def token_probability(token, language)
|
|
143
|
-
count = @tokens[language][token]
|
|
144
|
-
if count.nil? || count == 0
|
|
145
|
-
# This is usually the most common case, so we cache the result.
|
|
146
|
-
@unknown_logprob
|
|
147
|
-
else
|
|
148
|
-
Math.log(count.to_f / @language_tokens[language].to_f)
|
|
158
|
+
def debug_dump_scores(scores)
|
|
159
|
+
headers = ["Language", "Score"]
|
|
160
|
+
rows = scores.map { |l, s| [l, "%.3f" % s] }
|
|
161
|
+
dump_table(headers, rows)
|
|
149
162
|
end
|
|
150
|
-
end
|
|
151
163
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
164
|
+
def debug_dump_tokens(tokens)
|
|
165
|
+
counts = Hash.new(0)
|
|
166
|
+
tokens.each do |tok|
|
|
167
|
+
idx = @vocabulary[tok]
|
|
168
|
+
if not idx.nil?
|
|
169
|
+
counts[tok] += 1
|
|
170
|
+
end
|
|
171
|
+
end
|
|
160
172
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
173
|
+
norm = Classifier.l2_norm(counts)
|
|
174
|
+
rows = counts.map do |tok, tf|
|
|
175
|
+
idx = @vocabulary[tok]
|
|
176
|
+
log_tf = 1.0 + Math.log(tf)
|
|
177
|
+
with_icf = log_tf * @icf[idx]
|
|
178
|
+
normalized = with_icf / norm
|
|
179
|
+
row = [tok, tf, "%.3f" % log_tf, "%.3f" % with_icf, "%.3f" % normalized]
|
|
180
|
+
[normalized, row]
|
|
181
|
+
end
|
|
165
182
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
183
|
+
headers = ["Token", "TF", "log", "*ICF", "L2"]
|
|
184
|
+
rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
|
|
185
|
+
dump_table(headers, rows)
|
|
169
186
|
end
|
|
170
187
|
|
|
171
188
|
# Internal: show a table of probabilities for each <token,language> pair.
|
|
@@ -173,31 +190,223 @@ module Linguist
|
|
|
173
190
|
# The number in each table entry is the number of "points" that each
|
|
174
191
|
# token contributes toward the belief that the file under test is a
|
|
175
192
|
# particular language. Points are additive.
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
193
|
+
def debug_dump_all_tokens(tokens, scores)
|
|
194
|
+
languages = scores.map { |l, _| l }
|
|
195
|
+
|
|
196
|
+
counts = Hash.new(0)
|
|
197
|
+
tokens.each do |tok|
|
|
198
|
+
idx = @vocabulary[tok]
|
|
199
|
+
if not idx.nil?
|
|
200
|
+
counts[tok] += 1
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
data = {}
|
|
205
|
+
norm = Classifier.l2_norm(counts)
|
|
206
|
+
languages.each do |language|
|
|
207
|
+
data[language] = {}
|
|
208
|
+
counts.each do |tok, tf|
|
|
209
|
+
idx = @vocabulary[tok]
|
|
210
|
+
log_tf = 1.0 + Math.log(tf)
|
|
211
|
+
with_icf = log_tf * @icf[idx]
|
|
212
|
+
normalized = with_icf / norm
|
|
213
|
+
data[language][tok] = normalized * @centroids[language][idx].to_f
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
norm = Classifier.l2_norm(counts)
|
|
218
|
+
rows = counts.map do |tok, tf|
|
|
219
|
+
idx = @vocabulary[tok]
|
|
220
|
+
log_tf = 1.0 + Math.log(tf)
|
|
221
|
+
with_icf = log_tf * @icf[idx]
|
|
222
|
+
normalized = with_icf / norm
|
|
223
|
+
scores = languages.map do |l, _|
|
|
224
|
+
[l, data[l][tok].to_f]
|
|
225
|
+
end
|
|
226
|
+
max_score = scores.to_h.values.max
|
|
227
|
+
row = [tok] + scores.map do |l, s|
|
|
228
|
+
if s == max_score
|
|
229
|
+
"%.4f*" % s
|
|
230
|
+
elsif s > 0.0
|
|
231
|
+
"%.4f" % s
|
|
232
|
+
else
|
|
233
|
+
"-"
|
|
234
|
+
end
|
|
235
|
+
end
|
|
236
|
+
[normalized, row]
|
|
237
|
+
end
|
|
238
|
+
headers = ["Token"] + (0..languages.length-1).map { |lidx| "[#{lidx}]" }
|
|
239
|
+
rows = rows.sort_by { |x| -x[0] }.map { |_, row| row }
|
|
240
|
+
legend = languages.each_with_index.map { |l, lidx| "[#{lidx}] = #{l}" }
|
|
241
|
+
dump_table(headers, rows, legend)
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
def dump_table(header, rows, legend = nil)
|
|
245
|
+
n_cols = header.length
|
|
246
|
+
rows = rows.map { |r| r.map { |c| c.to_s } }
|
|
247
|
+
col_widths = (0..n_cols - 1).map do |j|
|
|
248
|
+
([header[j].length] + rows.map { |row| row[j].length }).max
|
|
249
|
+
end
|
|
250
|
+
sep_line = "| #{(0..n_cols-1).map { |j| "-" * col_widths[j] }.join(" | ")} |"
|
|
251
|
+
content_width = sep_line.length - 4
|
|
252
|
+
top_line = "| #{"-" * content_width} |"
|
|
253
|
+
|
|
254
|
+
format_row = Proc.new do |row|
|
|
255
|
+
cells = row.zip(col_widths).map do |cell, width|
|
|
256
|
+
"%-#{width}s" % cell
|
|
257
|
+
end
|
|
258
|
+
"| %s |" % cells.join(" | ")
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
puts top_line
|
|
262
|
+
puts format_row.call(header)
|
|
263
|
+
puts sep_line
|
|
264
|
+
rows.each do |row|
|
|
265
|
+
puts format_row.call(row)
|
|
266
|
+
end
|
|
267
|
+
puts top_line
|
|
268
|
+
if legend
|
|
269
|
+
legend.each do |line|
|
|
270
|
+
puts "| %-#{content_width}s |" % line
|
|
271
|
+
end
|
|
272
|
+
puts top_line
|
|
273
|
+
end
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
def self.to_vocabulary_index_termfreq(vocab, tokens)
|
|
277
|
+
counts = Hash.new(0)
|
|
278
|
+
tokens.each do |key|
|
|
279
|
+
idx = vocab[key]
|
|
280
|
+
counts[idx] += 1
|
|
281
|
+
end
|
|
282
|
+
counts
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
def self.to_vocabulary_index_termfreq_gaps(vocab, tokens)
|
|
286
|
+
counts = Hash.new(0)
|
|
287
|
+
tokens.each do |key|
|
|
288
|
+
if vocab.key? key
|
|
289
|
+
idx = vocab[key]
|
|
290
|
+
counts[idx] += 1
|
|
291
|
+
end
|
|
292
|
+
end
|
|
293
|
+
counts
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
def self.l2_norm(vec)
|
|
297
|
+
norm = vec.values.inject(0.0) { |sum, x| sum + x**2 }
|
|
298
|
+
Math.sqrt(norm)
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
def self.l2_normalize!(vec)
|
|
302
|
+
norm = l2_norm(vec)
|
|
303
|
+
vec.transform_values! do |value|
|
|
304
|
+
value.to_f / norm
|
|
305
|
+
end
|
|
306
|
+
nil
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
def self.similarity(a, b)
|
|
310
|
+
sum = 0.0
|
|
311
|
+
a.each_key do |idx|
|
|
312
|
+
if b.key? idx
|
|
313
|
+
sum += a[idx] * b[idx]
|
|
199
314
|
end
|
|
200
|
-
|
|
315
|
+
end
|
|
316
|
+
sum
|
|
201
317
|
end
|
|
318
|
+
|
|
319
|
+
# Filter vocabulary by minimum document frequency.
|
|
320
|
+
def self.filter_vocab_by_freq!(db, min_freq)
|
|
321
|
+
vocabulary = db['vocabulary']
|
|
322
|
+
|
|
323
|
+
# Get document frequencies
|
|
324
|
+
docfreq = Array.new(vocabulary.size, 0)
|
|
325
|
+
db['samples'].each_value do |samples|
|
|
326
|
+
samples.each do |sample|
|
|
327
|
+
sample.each_key do |idx|
|
|
328
|
+
docfreq[idx] += 1
|
|
329
|
+
end
|
|
330
|
+
end
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
vocabulary.select! do |_, idx|
|
|
334
|
+
docfreq[idx] >= min_freq
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
nil
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
# Sort vocabulary lexicographically.
|
|
341
|
+
def self.sort_vocab!(db)
|
|
342
|
+
new_indices = Hash.new { |h,k| h[k] = h.length }
|
|
343
|
+
db['vocabulary'].sort_by { |x| x[0] }.each do |term, idx|
|
|
344
|
+
db['vocabulary'][term] = new_indices[idx]
|
|
345
|
+
end
|
|
346
|
+
new_indices.default_proc = nil
|
|
347
|
+
|
|
348
|
+
db['samples'].transform_values! do |samples|
|
|
349
|
+
samples.map do |sample|
|
|
350
|
+
new_sample = {}
|
|
351
|
+
sample.each do |idx, freq|
|
|
352
|
+
new_idx = new_indices[idx]
|
|
353
|
+
if not new_idx.nil?
|
|
354
|
+
new_sample[new_idx] = freq
|
|
355
|
+
end
|
|
356
|
+
end
|
|
357
|
+
new_sample
|
|
358
|
+
end
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
|
|
362
|
+
# Compute inverse class frequency (ICF) for every term.
|
|
363
|
+
def self.inverse_class_freqs(db)
|
|
364
|
+
icf = Array.new(db['vocabulary'].size, 0)
|
|
365
|
+
db['samples'].each_value do |samples|
|
|
366
|
+
terms = Set.new
|
|
367
|
+
samples.each do |sample|
|
|
368
|
+
terms |= sample.keys
|
|
369
|
+
end
|
|
370
|
+
terms.each do |idx|
|
|
371
|
+
icf[idx] += 1
|
|
372
|
+
end
|
|
373
|
+
end
|
|
374
|
+
icf.map! do |val|
|
|
375
|
+
Math.log(db['samples'].size.to_f / val.to_f) + 1
|
|
376
|
+
end
|
|
377
|
+
icf
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
def self.normalize_samples!(db)
|
|
381
|
+
icf = db['icf']
|
|
382
|
+
db['samples'].each_value do |samples|
|
|
383
|
+
samples.each do |sample|
|
|
384
|
+
sample.each do |idx, freq|
|
|
385
|
+
tf = 1.0 + Math.log(freq)
|
|
386
|
+
sample[idx] = tf * icf[idx]
|
|
387
|
+
end
|
|
388
|
+
l2_normalize! sample
|
|
389
|
+
end
|
|
390
|
+
end
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
def self.get_centroids(db)
|
|
394
|
+
centroids = {}
|
|
395
|
+
db['samples'].each do |language, samples|
|
|
396
|
+
centroid = Hash.new(0.0)
|
|
397
|
+
samples.each do |sample|
|
|
398
|
+
sample.each do |idx, val|
|
|
399
|
+
centroid[idx] += val
|
|
400
|
+
end
|
|
401
|
+
end
|
|
402
|
+
centroid.each_key do |idx|
|
|
403
|
+
centroid[idx] = centroid[idx] / samples.length
|
|
404
|
+
end
|
|
405
|
+
l2_normalize! centroid
|
|
406
|
+
centroids[language] = centroid
|
|
407
|
+
end
|
|
408
|
+
centroids
|
|
409
|
+
end
|
|
410
|
+
|
|
202
411
|
end
|
|
203
412
|
end
|
data/lib/linguist/generated.rb
CHANGED
|
@@ -60,12 +60,16 @@ module Linguist
|
|
|
60
60
|
generated_net_specflow_feature_file? ||
|
|
61
61
|
composer_lock? ||
|
|
62
62
|
cargo_lock? ||
|
|
63
|
+
cargo_orig? ||
|
|
64
|
+
deno_lock? ||
|
|
63
65
|
flake_lock? ||
|
|
66
|
+
bazel_lock? ||
|
|
64
67
|
node_modules? ||
|
|
65
68
|
go_vendor? ||
|
|
66
69
|
go_lock? ||
|
|
67
70
|
poetry_lock? ||
|
|
68
71
|
pdm_lock? ||
|
|
72
|
+
uv_lock? ||
|
|
69
73
|
esy_lock? ||
|
|
70
74
|
npm_shrinkwrap_or_package_lock? ||
|
|
71
75
|
pnpm_lock? ||
|
|
@@ -420,6 +424,13 @@ module Linguist
|
|
|
420
424
|
!!name.match(/pdm\.lock/)
|
|
421
425
|
end
|
|
422
426
|
|
|
427
|
+
# Internal: Is the blob a generated uv.lock?
|
|
428
|
+
#
|
|
429
|
+
# Returns true or false.
|
|
430
|
+
def uv_lock?
|
|
431
|
+
!!name.match(/uv\.lock/)
|
|
432
|
+
end
|
|
433
|
+
|
|
423
434
|
# Internal: Is the blob a generated esy lock file?
|
|
424
435
|
#
|
|
425
436
|
# Returns true or false.
|
|
@@ -427,6 +438,13 @@ module Linguist
|
|
|
427
438
|
!!name.match(/(^|\/)(\w+\.)?esy.lock$/)
|
|
428
439
|
end
|
|
429
440
|
|
|
441
|
+
# Internal: Is the blob a generated deno lockfile, which are not meant for humans in pull requests.
|
|
442
|
+
#
|
|
443
|
+
# Returns true or false.
|
|
444
|
+
def deno_lock?
|
|
445
|
+
!!name.match(/deno\.lock/)
|
|
446
|
+
end
|
|
447
|
+
|
|
430
448
|
# Internal: Is the blob a generated npm shrinkwrap or package lock file?
|
|
431
449
|
#
|
|
432
450
|
# Returns true or false.
|
|
@@ -477,6 +495,13 @@ module Linguist
|
|
|
477
495
|
!!name.match(/Cargo\.lock/)
|
|
478
496
|
end
|
|
479
497
|
|
|
498
|
+
# Internal: Is the blob a generated Rust Cargo original file?
|
|
499
|
+
#
|
|
500
|
+
# Returns true or false.
|
|
501
|
+
def cargo_orig?
|
|
502
|
+
!!name.match(/Cargo\.toml\.orig/)
|
|
503
|
+
end
|
|
504
|
+
|
|
480
505
|
# Internal: Is the blob a generated Nix flakes lock file?
|
|
481
506
|
#
|
|
482
507
|
# Returns true or false
|
|
@@ -484,6 +509,13 @@ module Linguist
|
|
|
484
509
|
!!name.match(/(^|\/)flake\.lock$/)
|
|
485
510
|
end
|
|
486
511
|
|
|
512
|
+
# Internal: Is the blob a Bazel generated bzlmod lockfile?
|
|
513
|
+
#
|
|
514
|
+
# Returns true or false
|
|
515
|
+
def bazel_lock?
|
|
516
|
+
!!name.match(/(^|\/)MODULE\.bazel\.lock$/)
|
|
517
|
+
end
|
|
518
|
+
|
|
487
519
|
# Is the blob a VCR Cassette file?
|
|
488
520
|
#
|
|
489
521
|
# Returns true or false
|
|
@@ -681,14 +713,11 @@ module Linguist
|
|
|
681
713
|
|
|
682
714
|
# Internal: Is this a generated Game Maker Studio (2) metadata file?
|
|
683
715
|
#
|
|
684
|
-
# All Game Maker Studio 2 generated files will be JSON, .yy or .yyp, and have
|
|
685
|
-
# a part that looks like "modelName: GMname" on the 3rd line
|
|
686
|
-
#
|
|
687
716
|
# Return true or false
|
|
688
717
|
def generated_gamemakerstudio?
|
|
689
718
|
return false unless ['.yy', '.yyp'].include? extname
|
|
690
719
|
return false unless lines.count > 3
|
|
691
|
-
return lines
|
|
720
|
+
return lines.first(3).join('').match?(/^\s*[\{\[]/) ||
|
|
692
721
|
lines[0] =~ /^\d\.\d\.\d.+\|\{/
|
|
693
722
|
end
|
|
694
723
|
|
data/lib/linguist/generic.yml
CHANGED
data/lib/linguist/heuristics.rb
CHANGED
|
@@ -126,7 +126,7 @@ module Linguist
|
|
|
126
126
|
# Internal: Perform the heuristic
|
|
127
127
|
def call(data)
|
|
128
128
|
matched = @rules.find do |rule|
|
|
129
|
-
rule['pattern'].match(data)
|
|
129
|
+
rule['pattern'].match?(data)
|
|
130
130
|
end
|
|
131
131
|
if !matched.nil?
|
|
132
132
|
languages = matched['language']
|
|
@@ -145,14 +145,14 @@ module Linguist
|
|
|
145
145
|
@pats = pats
|
|
146
146
|
end
|
|
147
147
|
|
|
148
|
-
def match(input)
|
|
149
|
-
return
|
|
148
|
+
def match?(input)
|
|
149
|
+
return @pats.all? { |pat| pat.match?(input) }
|
|
150
150
|
end
|
|
151
151
|
|
|
152
152
|
end
|
|
153
153
|
|
|
154
154
|
class AlwaysMatch
|
|
155
|
-
def match(input)
|
|
155
|
+
def match?(input)
|
|
156
156
|
return true
|
|
157
157
|
end
|
|
158
158
|
end
|
|
@@ -163,8 +163,8 @@ module Linguist
|
|
|
163
163
|
@pat = pat
|
|
164
164
|
end
|
|
165
165
|
|
|
166
|
-
def match(input)
|
|
167
|
-
return !@pat.match(input)
|
|
166
|
+
def match?(input)
|
|
167
|
+
return !@pat.match?(input)
|
|
168
168
|
end
|
|
169
169
|
|
|
170
170
|
end
|