github-linguist 5.3.1 → 5.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/linguist/extconf.rb +3 -0
- data/ext/linguist/lex.linguist_yy.c +8269 -0
- data/ext/linguist/lex.linguist_yy.h +353 -0
- data/ext/linguist/linguist.c +64 -0
- data/ext/linguist/linguist.h +11 -0
- data/ext/linguist/tokenizer.l +119 -0
- data/grammars/source.coffee.json +123 -41
- data/grammars/source.crystal.json +2 -2
- data/grammars/source.css.less.json +319 -27
- data/grammars/source.glsl.json +1 -1
- data/grammars/source.js.json +6 -2
- data/grammars/source.meson.json +1 -1
- data/grammars/source.tsx.json +4 -14
- data/grammars/source.wdl.json +2 -2
- data/grammars/text.roff.json +155 -41
- data/grammars/text.shell-session.json +1 -1
- data/lib/linguist/blob_helper.rb +47 -4
- data/lib/linguist/classifier.rb +3 -1
- data/lib/linguist/file_blob.rb +3 -3
- data/lib/linguist/heuristics.rb +15 -6
- data/lib/linguist/linguist.bundle +0 -0
- data/lib/linguist/samples.json +49989 -44225
- data/lib/linguist/strategy/modeline.rb +2 -2
- data/lib/linguist/tokenizer.rb +1 -186
- data/lib/linguist/version.rb +1 -1
- metadata +25 -3
@@ -109,8 +109,8 @@ module Linguist
|
|
109
109
|
# Returns an Array with one Language if the blob has a Vim or Emacs modeline
|
110
110
|
# that matches a Language name or alias. Returns an empty array if no match.
|
111
111
|
def self.call(blob, _ = nil)
|
112
|
-
header = blob.lines.first(SEARCH_SCOPE).join("\n")
|
113
|
-
footer = blob.lines.last(SEARCH_SCOPE).join("\n")
|
112
|
+
header = blob.first_lines(SEARCH_SCOPE).join("\n")
|
113
|
+
footer = blob.last_lines(SEARCH_SCOPE).join("\n")
|
114
114
|
Array(Language.find_by_alias(modeline(header + footer)))
|
115
115
|
end
|
116
116
|
|
data/lib/linguist/tokenizer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'strscan'
|
2
|
+
require 'linguist/linguist'
|
2
3
|
|
3
4
|
module Linguist
|
4
5
|
# Generic programming language tokenizer.
|
@@ -15,191 +16,5 @@ module Linguist
|
|
15
16
|
def self.tokenize(data)
|
16
17
|
new.extract_tokens(data)
|
17
18
|
end
|
18
|
-
|
19
|
-
# Read up to 100KB
|
20
|
-
BYTE_LIMIT = 100_000
|
21
|
-
|
22
|
-
# Start state on token, ignore anything till the next newline
|
23
|
-
SINGLE_LINE_COMMENTS = [
|
24
|
-
'//', # C
|
25
|
-
'--', # Ada, Haskell, AppleScript
|
26
|
-
'#', # Ruby
|
27
|
-
'%', # Tex
|
28
|
-
'"', # Vim
|
29
|
-
]
|
30
|
-
|
31
|
-
# Start state on opening token, ignore anything until the closing
|
32
|
-
# token is reached.
|
33
|
-
MULTI_LINE_COMMENTS = [
|
34
|
-
['/*', '*/'], # C
|
35
|
-
['<!--', '-->'], # XML
|
36
|
-
['{-', '-}'], # Haskell
|
37
|
-
['(*', '*)'], # Coq
|
38
|
-
['"""', '"""'], # Python
|
39
|
-
["'''", "'''"] # Python
|
40
|
-
]
|
41
|
-
|
42
|
-
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
|
43
|
-
"\s*#{Regexp.escape(c)} "
|
44
|
-
}.join("|"))
|
45
|
-
|
46
|
-
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
|
47
|
-
Regexp.escape(c[0])
|
48
|
-
}.join("|"))
|
49
|
-
|
50
|
-
# Internal: Extract generic tokens from data.
|
51
|
-
#
|
52
|
-
# data - String to scan.
|
53
|
-
#
|
54
|
-
# Examples
|
55
|
-
#
|
56
|
-
# extract_tokens("printf('Hello')")
|
57
|
-
# # => ['printf', '(', ')']
|
58
|
-
#
|
59
|
-
# Returns Array of token Strings.
|
60
|
-
def extract_tokens(data)
|
61
|
-
s = StringScanner.new(data)
|
62
|
-
|
63
|
-
tokens = []
|
64
|
-
until s.eos?
|
65
|
-
break if s.pos >= BYTE_LIMIT
|
66
|
-
|
67
|
-
if token = s.scan(/^#!.+$/)
|
68
|
-
if name = extract_shebang(token)
|
69
|
-
tokens << "SHEBANG#!#{name}"
|
70
|
-
end
|
71
|
-
|
72
|
-
# Single line comment
|
73
|
-
elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
|
74
|
-
# tokens << token.strip
|
75
|
-
s.skip_until(/\n|\Z/)
|
76
|
-
|
77
|
-
# Multiline comments
|
78
|
-
elsif token = s.scan(START_MULTI_LINE_COMMENT)
|
79
|
-
# tokens << token
|
80
|
-
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
|
81
|
-
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
|
82
|
-
# tokens << close_token
|
83
|
-
|
84
|
-
# Skip single or double quoted strings
|
85
|
-
elsif s.scan(/"/)
|
86
|
-
if s.peek(1) == "\""
|
87
|
-
s.getch
|
88
|
-
else
|
89
|
-
s.skip_until(/(?<!\\)"/)
|
90
|
-
end
|
91
|
-
elsif s.scan(/'/)
|
92
|
-
if s.peek(1) == "'"
|
93
|
-
s.getch
|
94
|
-
else
|
95
|
-
s.skip_until(/(?<!\\)'/)
|
96
|
-
end
|
97
|
-
|
98
|
-
# Skip number literals
|
99
|
-
elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
|
100
|
-
|
101
|
-
# SGML style brackets
|
102
|
-
elsif token = s.scan(/<[^\s<>][^<>]*>/)
|
103
|
-
extract_sgml_tokens(token).each { |t| tokens << t }
|
104
|
-
|
105
|
-
# Common programming punctuation
|
106
|
-
elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
|
107
|
-
tokens << token
|
108
|
-
|
109
|
-
# Regular token
|
110
|
-
elsif token = s.scan(/[\w\.@#\/\*]+/)
|
111
|
-
tokens << token
|
112
|
-
|
113
|
-
# Common operators
|
114
|
-
elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
|
115
|
-
tokens << token
|
116
|
-
|
117
|
-
else
|
118
|
-
s.getch
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
tokens
|
123
|
-
end
|
124
|
-
|
125
|
-
# Internal: Extract normalized shebang command token.
|
126
|
-
#
|
127
|
-
# Examples
|
128
|
-
#
|
129
|
-
# extract_shebang("#!/usr/bin/ruby")
|
130
|
-
# # => "ruby"
|
131
|
-
#
|
132
|
-
# extract_shebang("#!/usr/bin/env node")
|
133
|
-
# # => "node"
|
134
|
-
#
|
135
|
-
# extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
|
136
|
-
# # => "awk"
|
137
|
-
#
|
138
|
-
# Returns String token or nil it couldn't be parsed.
|
139
|
-
def extract_shebang(data)
|
140
|
-
s = StringScanner.new(data)
|
141
|
-
|
142
|
-
if path = s.scan(/^#!\s*\S+/)
|
143
|
-
script = path.split('/').last
|
144
|
-
if script == 'env'
|
145
|
-
s.scan(/\s+/)
|
146
|
-
s.scan(/.*=[^\s]+\s+/)
|
147
|
-
script = s.scan(/\S+/)
|
148
|
-
end
|
149
|
-
script = script[/[^\d]+/, 0] if script
|
150
|
-
return script
|
151
|
-
end
|
152
|
-
|
153
|
-
nil
|
154
|
-
end
|
155
|
-
|
156
|
-
# Internal: Extract tokens from inside SGML tag.
|
157
|
-
#
|
158
|
-
# data - SGML tag String.
|
159
|
-
#
|
160
|
-
# Examples
|
161
|
-
#
|
162
|
-
# extract_sgml_tokens("<a href='' class=foo>")
|
163
|
-
# # => ["<a>", "href="]
|
164
|
-
#
|
165
|
-
# Returns Array of token Strings.
|
166
|
-
def extract_sgml_tokens(data)
|
167
|
-
s = StringScanner.new(data)
|
168
|
-
|
169
|
-
tokens = []
|
170
|
-
|
171
|
-
until s.eos?
|
172
|
-
# Emit start token
|
173
|
-
if token = s.scan(/<\/?[^\s>]+/)
|
174
|
-
tokens << "#{token}>"
|
175
|
-
|
176
|
-
# Emit attributes with trailing =
|
177
|
-
elsif token = s.scan(/\w+=/)
|
178
|
-
tokens << token
|
179
|
-
|
180
|
-
# Then skip over attribute value
|
181
|
-
if s.scan(/"/)
|
182
|
-
s.skip_until(/[^\\]"/)
|
183
|
-
elsif s.scan(/'/)
|
184
|
-
s.skip_until(/[^\\]'/)
|
185
|
-
else
|
186
|
-
s.skip_until(/\w+/)
|
187
|
-
end
|
188
|
-
|
189
|
-
# Emit lone attributes
|
190
|
-
elsif token = s.scan(/\w+/)
|
191
|
-
tokens << token
|
192
|
-
|
193
|
-
# Stop at the end of the tag
|
194
|
-
elsif s.scan(/>/)
|
195
|
-
s.terminate
|
196
|
-
|
197
|
-
else
|
198
|
-
s.getch
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
tokens
|
203
|
-
end
|
204
19
|
end
|
205
20
|
end
|
data/lib/linguist/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github-linguist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.3.1
|
4
|
+
version: 5.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- GitHub
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-10-
|
11
|
+
date: 2017-10-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: charlock_holmes
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '5.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake-compiler
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.9'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.9'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: mocha
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -199,12 +213,19 @@ email:
|
|
199
213
|
executables:
|
200
214
|
- linguist
|
201
215
|
- git-linguist
|
202
|
-
extensions: []
|
216
|
+
extensions:
|
217
|
+
- ext/linguist/extconf.rb
|
203
218
|
extra_rdoc_files: []
|
204
219
|
files:
|
205
220
|
- LICENSE
|
206
221
|
- bin/git-linguist
|
207
222
|
- bin/linguist
|
223
|
+
- ext/linguist/extconf.rb
|
224
|
+
- ext/linguist/lex.linguist_yy.c
|
225
|
+
- ext/linguist/lex.linguist_yy.h
|
226
|
+
- ext/linguist/linguist.c
|
227
|
+
- ext/linguist/linguist.h
|
228
|
+
- ext/linguist/tokenizer.l
|
208
229
|
- grammars/annotation.liquidhaskell.haskell.json
|
209
230
|
- grammars/config.xcompose.json
|
210
231
|
- grammars/file.lasso.json
|
@@ -651,6 +672,7 @@ files:
|
|
651
672
|
- lib/linguist/languages.json
|
652
673
|
- lib/linguist/languages.yml
|
653
674
|
- lib/linguist/lazy_blob.rb
|
675
|
+
- lib/linguist/linguist.bundle
|
654
676
|
- lib/linguist/md5.rb
|
655
677
|
- lib/linguist/popular.yml
|
656
678
|
- lib/linguist/repository.rb
|