github-linguist 5.3.1 → 5.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -109,8 +109,8 @@ module Linguist
109
109
  # Returns an Array with one Language if the blob has a Vim or Emacs modeline
110
110
  # that matches a Language name or alias. Returns an empty array if no match.
111
111
  def self.call(blob, _ = nil)
112
- header = blob.lines.first(SEARCH_SCOPE).join("\n")
113
- footer = blob.lines.last(SEARCH_SCOPE).join("\n")
112
+ header = blob.first_lines(SEARCH_SCOPE).join("\n")
113
+ footer = blob.last_lines(SEARCH_SCOPE).join("\n")
114
114
  Array(Language.find_by_alias(modeline(header + footer)))
115
115
  end
116
116
 
@@ -1,4 +1,5 @@
1
1
  require 'strscan'
2
+ require 'linguist/linguist'
2
3
 
3
4
  module Linguist
4
5
  # Generic programming language tokenizer.
@@ -15,191 +16,5 @@ module Linguist
15
16
  def self.tokenize(data)
16
17
  new.extract_tokens(data)
17
18
  end
18
-
19
- # Read up to 100KB
20
- BYTE_LIMIT = 100_000
21
-
22
- # Start state on token, ignore anything till the next newline
23
- SINGLE_LINE_COMMENTS = [
24
- '//', # C
25
- '--', # Ada, Haskell, AppleScript
26
- '#', # Ruby
27
- '%', # Tex
28
- '"', # Vim
29
- ]
30
-
31
- # Start state on opening token, ignore anything until the closing
32
- # token is reached.
33
- MULTI_LINE_COMMENTS = [
34
- ['/*', '*/'], # C
35
- ['<!--', '-->'], # XML
36
- ['{-', '-}'], # Haskell
37
- ['(*', '*)'], # Coq
38
- ['"""', '"""'], # Python
39
- ["'''", "'''"] # Python
40
- ]
41
-
42
- START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
43
- "\s*#{Regexp.escape(c)} "
44
- }.join("|"))
45
-
46
- START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
47
- Regexp.escape(c[0])
48
- }.join("|"))
49
-
50
- # Internal: Extract generic tokens from data.
51
- #
52
- # data - String to scan.
53
- #
54
- # Examples
55
- #
56
- # extract_tokens("printf('Hello')")
57
- # # => ['printf', '(', ')']
58
- #
59
- # Returns Array of token Strings.
60
- def extract_tokens(data)
61
- s = StringScanner.new(data)
62
-
63
- tokens = []
64
- until s.eos?
65
- break if s.pos >= BYTE_LIMIT
66
-
67
- if token = s.scan(/^#!.+$/)
68
- if name = extract_shebang(token)
69
- tokens << "SHEBANG#!#{name}"
70
- end
71
-
72
- # Single line comment
73
- elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
74
- # tokens << token.strip
75
- s.skip_until(/\n|\Z/)
76
-
77
- # Multiline comments
78
- elsif token = s.scan(START_MULTI_LINE_COMMENT)
79
- # tokens << token
80
- close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
81
- s.skip_until(Regexp.compile(Regexp.escape(close_token)))
82
- # tokens << close_token
83
-
84
- # Skip single or double quoted strings
85
- elsif s.scan(/"/)
86
- if s.peek(1) == "\""
87
- s.getch
88
- else
89
- s.skip_until(/(?<!\\)"/)
90
- end
91
- elsif s.scan(/'/)
92
- if s.peek(1) == "'"
93
- s.getch
94
- else
95
- s.skip_until(/(?<!\\)'/)
96
- end
97
-
98
- # Skip number literals
99
- elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
100
-
101
- # SGML style brackets
102
- elsif token = s.scan(/<[^\s<>][^<>]*>/)
103
- extract_sgml_tokens(token).each { |t| tokens << t }
104
-
105
- # Common programming punctuation
106
- elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
107
- tokens << token
108
-
109
- # Regular token
110
- elsif token = s.scan(/[\w\.@#\/\*]+/)
111
- tokens << token
112
-
113
- # Common operators
114
- elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
115
- tokens << token
116
-
117
- else
118
- s.getch
119
- end
120
- end
121
-
122
- tokens
123
- end
124
-
125
- # Internal: Extract normalized shebang command token.
126
- #
127
- # Examples
128
- #
129
- # extract_shebang("#!/usr/bin/ruby")
130
- # # => "ruby"
131
- #
132
- # extract_shebang("#!/usr/bin/env node")
133
- # # => "node"
134
- #
135
- # extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
136
- # # => "awk"
137
- #
138
- # Returns String token or nil it couldn't be parsed.
139
- def extract_shebang(data)
140
- s = StringScanner.new(data)
141
-
142
- if path = s.scan(/^#!\s*\S+/)
143
- script = path.split('/').last
144
- if script == 'env'
145
- s.scan(/\s+/)
146
- s.scan(/.*=[^\s]+\s+/)
147
- script = s.scan(/\S+/)
148
- end
149
- script = script[/[^\d]+/, 0] if script
150
- return script
151
- end
152
-
153
- nil
154
- end
155
-
156
- # Internal: Extract tokens from inside SGML tag.
157
- #
158
- # data - SGML tag String.
159
- #
160
- # Examples
161
- #
162
- # extract_sgml_tokens("<a href='' class=foo>")
163
- # # => ["<a>", "href="]
164
- #
165
- # Returns Array of token Strings.
166
- def extract_sgml_tokens(data)
167
- s = StringScanner.new(data)
168
-
169
- tokens = []
170
-
171
- until s.eos?
172
- # Emit start token
173
- if token = s.scan(/<\/?[^\s>]+/)
174
- tokens << "#{token}>"
175
-
176
- # Emit attributes with trailing =
177
- elsif token = s.scan(/\w+=/)
178
- tokens << token
179
-
180
- # Then skip over attribute value
181
- if s.scan(/"/)
182
- s.skip_until(/[^\\]"/)
183
- elsif s.scan(/'/)
184
- s.skip_until(/[^\\]'/)
185
- else
186
- s.skip_until(/\w+/)
187
- end
188
-
189
- # Emit lone attributes
190
- elsif token = s.scan(/\w+/)
191
- tokens << token
192
-
193
- # Stop at the end of the tag
194
- elsif s.scan(/>/)
195
- s.terminate
196
-
197
- else
198
- s.getch
199
- end
200
- end
201
-
202
- tokens
203
- end
204
19
  end
205
20
  end
@@ -1,3 +1,3 @@
1
1
  module Linguist
2
- VERSION = "5.3.1"
2
+ VERSION = "5.3.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: github-linguist
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.3.1
4
+ version: 5.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - GitHub
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-17 00:00:00.000000000 Z
11
+ date: 2017-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: charlock_holmes
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '5.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake-compiler
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.9'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.9'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: mocha
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -199,12 +213,19 @@ email:
199
213
  executables:
200
214
  - linguist
201
215
  - git-linguist
202
- extensions: []
216
+ extensions:
217
+ - ext/linguist/extconf.rb
203
218
  extra_rdoc_files: []
204
219
  files:
205
220
  - LICENSE
206
221
  - bin/git-linguist
207
222
  - bin/linguist
223
+ - ext/linguist/extconf.rb
224
+ - ext/linguist/lex.linguist_yy.c
225
+ - ext/linguist/lex.linguist_yy.h
226
+ - ext/linguist/linguist.c
227
+ - ext/linguist/linguist.h
228
+ - ext/linguist/tokenizer.l
208
229
  - grammars/annotation.liquidhaskell.haskell.json
209
230
  - grammars/config.xcompose.json
210
231
  - grammars/file.lasso.json
@@ -651,6 +672,7 @@ files:
651
672
  - lib/linguist/languages.json
652
673
  - lib/linguist/languages.yml
653
674
  - lib/linguist/lazy_blob.rb
675
+ - lib/linguist/linguist.bundle
654
676
  - lib/linguist/md5.rb
655
677
  - lib/linguist/popular.yml
656
678
  - lib/linguist/repository.rb