github-linguist 2.1.2 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/linguist.rb +1 -0
- data/lib/linguist/blob_helper.rb +7 -241
- data/lib/linguist/generated.rb +161 -0
- data/lib/linguist/language.rb +37 -54
- data/lib/linguist/languages.yml +5 -41
- data/lib/linguist/samples.json +12055 -5573
- data/lib/linguist/samples.rb +7 -5
- data/lib/linguist/tokenizer.rb +47 -5
- metadata +3 -2
data/lib/linguist/samples.rb
CHANGED
@@ -45,6 +45,10 @@ module Linguist
|
|
45
45
|
})
|
46
46
|
end
|
47
47
|
else
|
48
|
+
if File.extname(filename) == ""
|
49
|
+
raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
|
50
|
+
end
|
51
|
+
|
48
52
|
yield({
|
49
53
|
:path => File.join(dirname, filename),
|
50
54
|
:language => category,
|
@@ -68,18 +72,16 @@ module Linguist
|
|
68
72
|
each do |sample|
|
69
73
|
language_name = sample[:language]
|
70
74
|
|
71
|
-
|
72
|
-
if sample[:extname] && sample[:extname] != ""
|
75
|
+
if sample[:extname]
|
73
76
|
db['extnames'][language_name] ||= []
|
74
77
|
if !db['extnames'][language_name].include?(sample[:extname])
|
75
78
|
db['extnames'][language_name] << sample[:extname]
|
76
79
|
end
|
77
80
|
end
|
78
81
|
|
79
|
-
|
80
|
-
if fn = sample[:filename]
|
82
|
+
if sample[:filename]
|
81
83
|
db['filenames'][language_name] ||= []
|
82
|
-
db['filenames'][language_name] << fn
|
84
|
+
db['filenames'][language_name] << sample[:filename]
|
83
85
|
end
|
84
86
|
|
85
87
|
data = File.read(sample[:path])
|
data/lib/linguist/tokenizer.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module Linguist
|
2
4
|
# Generic programming language tokenizer.
|
3
5
|
#
|
@@ -50,8 +52,13 @@ module Linguist
|
|
50
52
|
|
51
53
|
tokens = []
|
52
54
|
until s.eos?
|
55
|
+
if token = s.scan(/^#!.+$/)
|
56
|
+
if name = extract_shebang(token)
|
57
|
+
tokens << "SHEBANG#!#{name}"
|
58
|
+
end
|
59
|
+
|
53
60
|
# Single line comment
|
54
|
-
|
61
|
+
elsif token = s.scan(START_SINGLE_LINE_COMMENT)
|
55
62
|
tokens << token.strip
|
56
63
|
s.skip_until(/\n|\Z/)
|
57
64
|
|
@@ -64,19 +71,27 @@ module Linguist
|
|
64
71
|
|
65
72
|
# Skip single or double quoted strings
|
66
73
|
elsif s.scan(/"/)
|
67
|
-
s.skip_until(/[^\\]"/)
|
74
|
+
if s.peek(1) == "\""
|
75
|
+
s.getch
|
76
|
+
else
|
77
|
+
s.skip_until(/[^\\]"/)
|
78
|
+
end
|
68
79
|
elsif s.scan(/'/)
|
69
|
-
s.skip_until(/[^\\]'/)
|
80
|
+
if s.peek(1) == "'"
|
81
|
+
s.getch
|
82
|
+
else
|
83
|
+
s.skip_until(/[^\\]'/)
|
84
|
+
end
|
70
85
|
|
71
86
|
# Skip number literals
|
72
|
-
elsif s.scan(/(0x)?\d
|
87
|
+
elsif s.scan(/(0x)?\d(\d|\.)*/)
|
73
88
|
|
74
89
|
# SGML style brackets
|
75
90
|
elsif token = s.scan(/<[^\s<>][^<>]*>/)
|
76
91
|
extract_sgml_tokens(token).each { |t| tokens << t }
|
77
92
|
|
78
93
|
# Common programming punctuation
|
79
|
-
elsif token = s.scan(/;|\{|\}|\(|\)/)
|
94
|
+
elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
|
80
95
|
tokens << token
|
81
96
|
|
82
97
|
# Regular token
|
@@ -95,6 +110,33 @@ module Linguist
|
|
95
110
|
tokens
|
96
111
|
end
|
97
112
|
|
113
|
+
# Internal: Extract normalized shebang command token.
|
114
|
+
#
|
115
|
+
# Examples
|
116
|
+
#
|
117
|
+
# extract_shebang("#!/usr/bin/ruby")
|
118
|
+
# # => "ruby"
|
119
|
+
#
|
120
|
+
# extract_shebang("#!/usr/bin/env node")
|
121
|
+
# # => "node"
|
122
|
+
#
|
123
|
+
# Returns String token or nil if it couldn't be parsed.
|
124
|
+
def extract_shebang(data)
|
125
|
+
s = StringScanner.new(data)
|
126
|
+
|
127
|
+
if path = s.scan(/^#!\s*\S+/)
|
128
|
+
script = path.split('/').last
|
129
|
+
if script == 'env'
|
130
|
+
s.scan(/\s+/)
|
131
|
+
script = s.scan(/\S+/)
|
132
|
+
end
|
133
|
+
script = script[/[^\d]+/, 0]
|
134
|
+
return script
|
135
|
+
end
|
136
|
+
|
137
|
+
nil
|
138
|
+
end
|
139
|
+
|
98
140
|
# Internal: Extract tokens from inside SGML tag.
|
99
141
|
#
|
100
142
|
# data - SGML tag String.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github-linguist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.2
|
4
|
+
version: 2.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: charlock_holmes
|
@@ -133,6 +133,7 @@ files:
|
|
133
133
|
- lib/linguist/blob_helper.rb
|
134
134
|
- lib/linguist/classifier.rb
|
135
135
|
- lib/linguist/file_blob.rb
|
136
|
+
- lib/linguist/generated.rb
|
136
137
|
- lib/linguist/language.rb
|
137
138
|
- lib/linguist/languages.yml
|
138
139
|
- lib/linguist/md5.rb
|