github-linguist 5.3.1 → 5.3.2
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/ext/linguist/extconf.rb +3 -0
- data/ext/linguist/lex.linguist_yy.c +8269 -0
- data/ext/linguist/lex.linguist_yy.h +353 -0
- data/ext/linguist/linguist.c +64 -0
- data/ext/linguist/linguist.h +11 -0
- data/ext/linguist/tokenizer.l +119 -0
- data/grammars/source.coffee.json +123 -41
- data/grammars/source.crystal.json +2 -2
- data/grammars/source.css.less.json +319 -27
- data/grammars/source.glsl.json +1 -1
- data/grammars/source.js.json +6 -2
- data/grammars/source.meson.json +1 -1
- data/grammars/source.tsx.json +4 -14
- data/grammars/source.wdl.json +2 -2
- data/grammars/text.roff.json +155 -41
- data/grammars/text.shell-session.json +1 -1
- data/lib/linguist/blob_helper.rb +47 -4
- data/lib/linguist/classifier.rb +3 -1
- data/lib/linguist/file_blob.rb +3 -3
- data/lib/linguist/heuristics.rb +15 -6
- data/lib/linguist/linguist.bundle +0 -0
- data/lib/linguist/samples.json +49989 -44225
- data/lib/linguist/strategy/modeline.rb +2 -2
- data/lib/linguist/tokenizer.rb +1 -186
- data/lib/linguist/version.rb +1 -1
- metadata +25 -3
data/lib/linguist/strategy/modeline.rb
CHANGED
@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.
-        footer = blob.
+        header = blob.first_lines(SEARCH_SCOPE).join("\n")
+        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
 
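The new code asks the blob for only the first and last SEARCH_SCOPE lines rather than materializing the whole file. A minimal sketch of that call path; FakeBlob and SCOPE below are illustrative stand-ins, not names from the gem:

    # Illustrative stand-in for a Linguist blob; only the two helpers
    # the strategy calls are implemented here.
    SCOPE = 5

    class FakeBlob
      def initialize(data)
        @data = data
      end

      # Up to n lines from the top of the file.
      def first_lines(n)
        @data.lines.first(n).map(&:chomp)
      end

      # Up to n lines from the bottom of the file.
      def last_lines(n)
        @data.lines.last(n).map(&:chomp)
      end
    end

    blob   = FakeBlob.new("# vim: set ft=ruby:\nputs :hi\n")
    header = blob.first_lines(SCOPE).join("\n")
    footer = blob.last_lines(SCOPE).join("\n")
    # header + footer is the only text the modeline regexes ever see.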
data/lib/linguist/tokenizer.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'strscan'
+require 'linguist/linguist'
 
 module Linguist
   # Generic programming language tokenizer.
@@ -15,191 +16,5 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
-
-    # Read up to 100KB
-    BYTE_LIMIT = 100_000
-
-    # Start state on token, ignore anything till the next newline
-    SINGLE_LINE_COMMENTS = [
-      '//', # C
-      '--', # Ada, Haskell, AppleScript
-      '#',  # Ruby
-      '%',  # Tex
-      '"',  # Vim
-    ]
-
-    # Start state on opening token, ignore anything until the closing
-    # token is reached.
-    MULTI_LINE_COMMENTS = [
-      ['/*', '*/'],    # C
-      ['<!--', '-->'], # XML
-      ['{-', '-}'],    # Haskell
-      ['(*', '*)'],    # Coq
-      ['"""', '"""'],  # Python
-      ["'''", "'''"]   # Python
-    ]
-
-    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
-      "\s*#{Regexp.escape(c)} "
-    }.join("|"))
-
-    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
-      Regexp.escape(c[0])
-    }.join("|"))
-
-    # Internal: Extract generic tokens from data.
-    #
-    # data - String to scan.
-    #
-    # Examples
-    #
-    #   extract_tokens("printf('Hello')")
-    #   # => ['printf', '(', ')']
-    #
-    # Returns Array of token Strings.
-    def extract_tokens(data)
-      s = StringScanner.new(data)
-
-      tokens = []
-      until s.eos?
-        break if s.pos >= BYTE_LIMIT
-
-        if token = s.scan(/^#!.+$/)
-          if name = extract_shebang(token)
-            tokens << "SHEBANG#!#{name}"
-          end
-
-        # Single line comment
-        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
-          # tokens << token.strip
-          s.skip_until(/\n|\Z/)
-
-        # Multiline comments
-        elsif token = s.scan(START_MULTI_LINE_COMMENT)
-          # tokens << token
-          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
-          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
-          # tokens << close_token
-
-        # Skip single or double quoted strings
-        elsif s.scan(/"/)
-          if s.peek(1) == "\""
-            s.getch
-          else
-            s.skip_until(/(?<!\\)"/)
-          end
-        elsif s.scan(/'/)
-          if s.peek(1) == "'"
-            s.getch
-          else
-            s.skip_until(/(?<!\\)'/)
-          end
-
-        # Skip number literals
-        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
-
-        # SGML style brackets
-        elsif token = s.scan(/<[^\s<>][^<>]*>/)
-          extract_sgml_tokens(token).each { |t| tokens << t }
-
-        # Common programming punctuation
-        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
-          tokens << token
-
-        # Regular token
-        elsif token = s.scan(/[\w\.@#\/\*]+/)
-          tokens << token
-
-        # Common operators
-        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
-          tokens << token
-
-        else
-          s.getch
-        end
-      end
-
-      tokens
-    end
-
-    # Internal: Extract normalized shebang command token.
-    #
-    # Examples
-    #
-    #   extract_shebang("#!/usr/bin/ruby")
-    #   # => "ruby"
-    #
-    #   extract_shebang("#!/usr/bin/env node")
-    #   # => "node"
-    #
-    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
-    #   # => "awk"
-    #
-    # Returns String token or nil it couldn't be parsed.
-    def extract_shebang(data)
-      s = StringScanner.new(data)
-
-      if path = s.scan(/^#!\s*\S+/)
-        script = path.split('/').last
-        if script == 'env'
-          s.scan(/\s+/)
-          s.scan(/.*=[^\s]+\s+/)
-          script = s.scan(/\S+/)
-        end
-        script = script[/[^\d]+/, 0] if script
-        return script
-      end
-
-      nil
-    end
-
-    # Internal: Extract tokens from inside SGML tag.
-    #
-    # data - SGML tag String.
-    #
-    # Examples
-    #
-    #   extract_sgml_tokens("<a href='' class=foo>")
-    #   # => ["<a>", "href="]
-    #
-    # Returns Array of token Strings.
-    def extract_sgml_tokens(data)
-      s = StringScanner.new(data)
-
-      tokens = []
-
-      until s.eos?
-        # Emit start token
-        if token = s.scan(/<\/?[^\s>]+/)
-          tokens << "#{token}>"
-
-        # Emit attributes with trailing =
-        elsif token = s.scan(/\w+=/)
-          tokens << token
-
-          # Then skip over attribute value
-          if s.scan(/"/)
-            s.skip_until(/[^\\]"/)
-          elsif s.scan(/'/)
-            s.skip_until(/[^\\]'/)
-          else
-            s.skip_until(/\w+/)
-          end
-
-        # Emit lone attributes
-        elsif token = s.scan(/\w+/)
-          tokens << token
-
-        # Stop at the end of the tag
-        elsif s.scan(/>/)
-          s.terminate
-
-        else
-          s.getch
-        end
-      end
-
-      tokens
-    end
   end
 end
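The entire StringScanner implementation is removed in favor of a flex-generated C extension (tokenizer.l compiled to lex.linguist_yy.c, loaded by the new require 'linguist/linguist' line); the Ruby entry point is unchanged. A usage sketch, with the expected result taken from the removed docstring:

    require 'linguist'

    # Same public API as before 5.3.2; the scanning itself now runs in
    # the native extension instead of Ruby's StringScanner.
    p Linguist::Tokenizer.tokenize("printf('Hello')")
    # => ["printf", "(", ")"]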
data/lib/linguist/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: github-linguist
 version: !ruby/object:Gem::Version
-  version: 5.3.1
+  version: 5.3.2
 platform: ruby
 authors:
 - GitHub
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-10-
+date: 2017-10-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: charlock_holmes
@@ -80,6 +80,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.9'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.9'
 - !ruby/object:Gem::Dependency
   name: mocha
   requirement: !ruby/object:Gem::Requirement
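rake-compiler arrives as a development dependency to build the new extension during development and testing. A sketch of the usual Rake::ExtensionTask wiring; the gem's actual Rakefile is not part of this diff, and lib_dir is inferred from the lib/linguist/linguist.bundle entry further down:

    require 'rake/extensiontask'

    # Typical rake-compiler setup: `rake compile` builds ext/linguist
    # into a shared object under lib/linguist/.
    Rake::ExtensionTask.new('linguist') do |ext|
      ext.ext_dir = 'ext/linguist'
      ext.lib_dir = 'lib/linguist'
    end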
@@ -199,12 +213,19 @@ email:
 executables:
 - linguist
 - git-linguist
-extensions: []
+extensions:
+- ext/linguist/extconf.rb
 extra_rdoc_files: []
 files:
 - LICENSE
 - bin/git-linguist
 - bin/linguist
+- ext/linguist/extconf.rb
+- ext/linguist/lex.linguist_yy.c
+- ext/linguist/lex.linguist_yy.h
+- ext/linguist/linguist.c
+- ext/linguist/linguist.h
+- ext/linguist/tokenizer.l
 - grammars/annotation.liquidhaskell.haskell.json
 - grammars/config.xcompose.json
 - grammars/file.lasso.json
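Because the gemspec now declares extensions: ext/linguist/extconf.rb, gem install runs that script on the target machine to generate a Makefile and compile the extension. A minimal extconf.rb of the usual mkmf shape, as a sketch; the gem's real file may differ:

    require 'mkmf'

    # Names the require path of the built library, matching the
    # `require 'linguist/linguist'` added to tokenizer.rb.
    create_makefile('linguist/linguist')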
@@ -651,6 +672,7 @@ files:
 - lib/linguist/languages.json
 - lib/linguist/languages.yml
 - lib/linguist/lazy_blob.rb
+- lib/linguist/linguist.bundle
 - lib/linguist/md5.rb
 - lib/linguist/popular.yml
 - lib/linguist/repository.rb