tongue 0.2.10.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/tongue +46 -0
- data/lib/linguist.rb +6 -0
- data/lib/linguist/blob_helper.rb +333 -0
- data/lib/linguist/classifier.rb +171 -0
- data/lib/linguist/file_blob.rb +58 -0
- data/lib/linguist/generated.rb +241 -0
- data/lib/linguist/heuristics.rb +38 -0
- data/lib/linguist/language.rb +578 -0
- data/lib/linguist/languages.yml +1901 -0
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/popular.yml +29 -0
- data/lib/linguist/repository.rb +95 -0
- data/lib/linguist/samples.json +47115 -0
- data/lib/linguist/samples.rb +149 -0
- data/lib/linguist/tokenizer.rb +198 -0
- data/lib/linguist/vendor.yml +167 -0
- metadata +143 -0
@@ -0,0 +1,149 @@
|
|
1
|
+
begin
|
2
|
+
require 'json'
|
3
|
+
rescue LoadError
|
4
|
+
require 'yaml'
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'linguist/md5'
|
8
|
+
require 'linguist/classifier'
|
9
|
+
|
10
|
+
module Linguist
|
11
|
+
# Model for accessing classifier training data.
|
12
|
+
module Samples
  # Path to samples root directory
  ROOT = File.expand_path("../../../samples", __FILE__)

  # Path for serialized samples db
  PATH = File.expand_path('../samples.json', __FILE__)

  # Hash of serialized samples object.
  #
  # Only defined when the serialized db exists on disk. It is parsed with
  # JSON when that library is loaded and with YAML otherwise.
  if File.exist?(PATH)
    serializer = defined?(JSON) ? JSON : YAML
    DATA = serializer.load(File.read(PATH))
  end

  # Public: Iterate over each sample.
  #
  # Every directory directly under ROOT is treated as a language category
  # (the 'Text' and 'Binary' categories are skipped). Entries inside a
  # category's `filenames` subdirectory are yielded with a :filename key;
  # every other entry must have an extension and is yielded with :extname
  # and :interpreter keys.
  #
  # &block - Yields a sample Hash (:path, :language and either :filename or
  #          :interpreter + :extname) to the block.
  #
  # Returns nothing.
  # Raises RuntimeError if a sample outside `filenames` has no extension.
  def self.each(&block)
    Dir.entries(ROOT).each do |category|
      next if category == '.' || category == '..'

      # Skip text and binary for now
      # Possibly reconsider this later
      next if category == 'Text' || category == 'Binary'

      dirname = File.join(ROOT, category)
      Dir.entries(dirname).each do |filename|
        next if filename == '.' || filename == '..'

        path = File.join(dirname, filename)

        if filename == 'filenames'
          Dir.entries(path).each do |subfilename|
            next if subfilename == '.' || subfilename == '..'

            yield({
              :path     => File.join(path, subfilename),
              :language => category,
              :filename => subfilename
            })
          end
        else
          if File.extname(filename) == ""
            raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
          end

          # The shebang must be read from the sample's full path. The bare
          # entry name would be resolved against the current working
          # directory, where the sample does not live.
          interpreter = File.file?(path) ? Linguist.interpreter_from_shebang(File.read(path)) : nil

          yield({
            :path        => path,
            :language    => category,
            :interpreter => interpreter,
            :extname     => File.extname(filename)
          })
        end
      end
    end

    nil
  end

  # Public: Build the samples database from all samples.
  #
  # Collects, per language, the sorted lists of extensions, interpreters and
  # filenames seen in the samples, passes every sample's contents to
  # Classifier.train!, and finally stores a digest of the db under 'md5'.
  #
  # Returns the db Hash.
  def self.data
    db = {}
    db['extnames'] = {}
    db['interpreters'] = {}
    db['filenames'] = {}

    each do |sample|
      language_name = sample[:language]

      if sample[:extname]
        extnames = (db['extnames'][language_name] ||= [])
        unless extnames.include?(sample[:extname])
          extnames << sample[:extname]
          extnames.sort!
        end
      end

      if sample[:interpreter]
        interpreters = (db['interpreters'][language_name] ||= [])
        unless interpreters.include?(sample[:interpreter])
          interpreters << sample[:interpreter]
          interpreters.sort!
        end
      end

      if sample[:filename]
        filenames = (db['filenames'][language_name] ||= [])
        filenames << sample[:filename]
        filenames.sort!
      end

      data = File.read(sample[:path])
      Classifier.train!(db, language_name, data)
    end

    db['md5'] = Linguist::MD5.hexdigest(db)

    db
  end
end
|
113
|
+
|
114
|
+
# Used to retrieve the interpreter from the shebang line of a file's
|
115
|
+
# data.
|
116
|
+
# Used to retrieve the interpreter from the shebang line of a file's data.
#
# data - String contents of the file.
#
# Examples
#
#   interpreter_from_shebang("#!/usr/bin/env python2.6\n")
#   # => "python"
#
#   interpreter_from_shebang("puts 1")
#   # => nil
#
# Returns the interpreter name String with any version number removed, or
# nil when the first line is not a shebang or `env` is given no command.
def self.interpreter_from_shebang(data)
  # Only the first five lines are ever inspected, so stop reading there
  # instead of splitting the whole file into lines.
  head = data.each_line.first(5)
  bang = head.first
  return nil unless bang && bang =~ /\A#!/

  # Accept any amount of whitespace between "#!" and the interpreter path
  # ("#! /bin/sh", "#!  /bin/sh", "#!\t/bin/sh").
  tokens = bang.sub(/\A#!\s*/, '#!').split(' ')
  pieces = tokens.first.split('/')

  # "#!/usr/bin/ruby" -> "ruby", "#!ruby" -> "ruby"
  script = pieces.size > 1 ? pieces.last : pieces.first.sub('#!', '')

  # "#!/usr/bin/env ruby" -> "ruby"; a bare `env` has no command to report.
  script = tokens[1] if script == 'env'
  return nil unless script

  # "python2.6" -> "python"
  script = script.sub(/(?:\d+\.?)+/, '')

  # Check for multiline shebang hacks that call `exec`
  if script == 'sh'
    head.each do |line|
      hack = line.match(/exec (\w+).+\$0.+\$@/)
      next unless hack

      script = hack[1]
      break
    end
  end

  script
end
|
148
|
+
|
149
|
+
end
|
@@ -0,0 +1,198 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ].freeze

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)'],    # Coq
      ['"""', '"""']   # Python
    ].freeze

    # Matches optional leading spaces, a single line comment marker and one
    # trailing space.
    #
    # NOTE: inside a double-quoted Ruby string "\s" is a literal space, so
    # only spaces (not tabs) are skipped before the marker. This is kept as
    # is because changing it would change the emitted tokens.
    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    # Matches any multi line comment opening token.
    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Opening token String => precompiled Regexp matching its closing token,
    # so the closing pattern is not recompiled for every comment scanned.
    END_MULTI_LINE_COMMENT = Hash[MULTI_LINE_COMMENTS.map { |open_token, close_token|
      [open_token, Regexp.compile(Regexp.escape(close_token))]
    }].freeze

    # Internal: Extract generic tokens from data.
    #
    # The branches below are tried in order at every scanner position, so
    # their order is significant. Scanning stops once the scanner position
    # reaches BYTE_LIMIT.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        break if s.pos >= BYTE_LIMIT

        # Shebang line, reduced to the interpreter name
        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          s.skip_until(/\n|\Z/)

        # Multiline comments. If the closing token is never found,
        # skip_until does not advance and scanning resumes right after the
        # opening token.
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          s.skip_until(END_MULTI_LINE_COMMENT[token])

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        # Anything else is dropped one character at a time
        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # data - String shebang line.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    # Returns String token or nil if it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        script = path.split('/').last
        if script == 'env'
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        # Keep only the leading run of non-digit characters
        # ("python2.6" -> "python").
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
# Vendored files and directories are excluded from language
# statistics.
#
# Lines in this file are Regexps that are matched against the file
# pathname. A literal dot must be written as `\.`; a bare `.` matches
# any character.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_vendored` if you make any changes.

## Vendor Conventions ##

# Caches
- cache/

# Dependencies
- ^[Dd]ependencies/

# C deps
#  https://github.com/joyent/node
- ^deps/
- ^tools/
- (^|/)configure$
- (^|/)configure\.ac$
- (^|/)config\.guess$
- (^|/)config\.sub$

# Node dependencies
- node_modules/

# Bower Components
- bower_components/

# Erlang bundles
- ^rebar$

# Bootstrap minified css and js
- (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$

# Vendored dependencies
- thirdparty/
- vendors?/

# Debian packaging
- ^debian/

## Commonly Bundled JavaScript frameworks ##

# jQuery
- (^|/)jquery([^.]*)(\.min)?\.js$
- (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$

# jQuery UI
- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
- (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$

# Prototype
- (^|/)prototype(.*)\.js$
- (^|/)effects\.js$
- (^|/)controls\.js$
- (^|/)dragdrop\.js$

# MooTools
- (^|/)mootools([^.]*)\d+\.\d+\.\d+([^.]*)\.js$

# Dojo
- (^|/)dojo\.js$

# MochiKit
- (^|/)MochiKit\.js$

# YUI
- (^|/)yahoo-([^.]*)\.js$
- (^|/)yui([^.]*)\.js$

# WYSIWYG editors
- (^|/)ckeditor\.js$
- (^|/)tiny_mce([^.]*)\.js$
- (^|/)tiny_mce/(langs|plugins|themes|utils)

# MathJax
- (^|/)MathJax/

# SyntaxHighlighter - http://alexgorbatchev.com/
- (^|/)shBrush([^.]*)\.js$
- (^|/)shCore\.js$
- (^|/)shLegacy\.js$

# AngularJS
- (^|/)angular([^.]*)(\.min)?\.js$

## Python ##

# django
- (^|/)admin_media/

# Fabric
- ^fabfile\.py$

# WAF
- ^waf$

# .osx
- ^\.osx$

## Obj-C ##

# Sparkle
- (^|/)Sparkle/

## .NET ##

# Visual Studio IntelliSense
- -vsdoc\.js$

# jQuery validation plugin (MS bundles this with asp.net mvc)
- (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
- (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$

# Microsoft Ajax
- (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$

# NuGet
- ^[Pp]ackages/

# ExtJS
- (^|/)extjs/.*?\.js$
- (^|/)extjs/.*?\.xml$
- (^|/)extjs/.*?\.txt$
- (^|/)extjs/.*?\.html$
- (^|/)extjs/.*?\.properties$
- (^|/)extjs/\.sencha/
- (^|/)extjs/docs/
- (^|/)extjs/builds/
- (^|/)extjs/cmd/
- (^|/)extjs/examples/
- (^|/)extjs/locale/
- (^|/)extjs/packages/
- (^|/)extjs/plugins/
- (^|/)extjs/resources/
- (^|/)extjs/src/
- (^|/)extjs/welcome/

# Samples folders
- ^[Ss]amples/

# LICENSE, README, git config files
- ^COPYING$
- LICENSE$
- License$
- gitattributes$
- gitignore$
- gitmodules$
- ^README$
- ^readme$

# Test fixtures
- ^[Tt]est/fixtures/

# PhoneGap/Cordova
- (^|/)cordova([^.]*)(\.min)?\.js$
- (^|/)cordova\-\d\.\d(\.\d)?(\.min)?\.js$

# Vagrant
- ^Vagrantfile$

# .DS_Store's
- \.[Dd][Ss]_[Ss]tore$
|