github-linguist 5.3.1 → 5.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -109,8 +109,8 @@ module Linguist
109
109
  # Returns an Array with one Language if the blob has a Vim or Emacs modeline
110
110
  # that matches a Language name or alias. Returns an empty array if no match.
111
111
  def self.call(blob, _ = nil)
112
- header = blob.lines.first(SEARCH_SCOPE).join("\n")
113
- footer = blob.lines.last(SEARCH_SCOPE).join("\n")
112
+ header = blob.first_lines(SEARCH_SCOPE).join("\n")
113
+ footer = blob.last_lines(SEARCH_SCOPE).join("\n")
114
114
  Array(Language.find_by_alias(modeline(header + footer)))
115
115
  end
116
116
 
@@ -1,4 +1,5 @@
1
1
  require 'strscan'
2
+ require 'linguist/linguist'
2
3
 
3
4
  module Linguist
4
5
  # Generic programming language tokenizer.
@@ -15,191 +16,5 @@ module Linguist
15
16
  def self.tokenize(data)
16
17
  new.extract_tokens(data)
17
18
  end
18
-
19
- # Read up to 100KB
20
- BYTE_LIMIT = 100_000
21
-
22
- # Start state on token, ignore anything till the next newline
23
- SINGLE_LINE_COMMENTS = [
24
- '//', # C
25
- '--', # Ada, Haskell, AppleScript
26
- '#', # Ruby
27
- '%', # Tex
28
- '"', # Vim
29
- ]
30
-
31
- # Start state on opening token, ignore anything until the closing
32
- # token is reached.
33
- MULTI_LINE_COMMENTS = [
34
- ['/*', '*/'], # C
35
- ['<!--', '-->'], # XML
36
- ['{-', '-}'], # Haskell
37
- ['(*', '*)'], # Coq
38
- ['"""', '"""'], # Python
39
- ["'''", "'''"] # Python
40
- ]
41
-
42
- START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
43
- "\s*#{Regexp.escape(c)} "
44
- }.join("|"))
45
-
46
- START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
47
- Regexp.escape(c[0])
48
- }.join("|"))
49
-
50
- # Internal: Extract generic tokens from data.
51
- #
52
- # data - String to scan.
53
- #
54
- # Examples
55
- #
56
- # extract_tokens("printf('Hello')")
57
- # # => ['printf', '(', ')']
58
- #
59
- # Returns Array of token Strings.
60
- def extract_tokens(data)
61
- s = StringScanner.new(data)
62
-
63
- tokens = []
64
- until s.eos?
65
- break if s.pos >= BYTE_LIMIT
66
-
67
- if token = s.scan(/^#!.+$/)
68
- if name = extract_shebang(token)
69
- tokens << "SHEBANG#!#{name}"
70
- end
71
-
72
- # Single line comment
73
- elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
74
- # tokens << token.strip
75
- s.skip_until(/\n|\Z/)
76
-
77
- # Multiline comments
78
- elsif token = s.scan(START_MULTI_LINE_COMMENT)
79
- # tokens << token
80
- close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
81
- s.skip_until(Regexp.compile(Regexp.escape(close_token)))
82
- # tokens << close_token
83
-
84
- # Skip single or double quoted strings
85
- elsif s.scan(/"/)
86
- if s.peek(1) == "\""
87
- s.getch
88
- else
89
- s.skip_until(/(?<!\\)"/)
90
- end
91
- elsif s.scan(/'/)
92
- if s.peek(1) == "'"
93
- s.getch
94
- else
95
- s.skip_until(/(?<!\\)'/)
96
- end
97
-
98
- # Skip number literals
99
- elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
100
-
101
- # SGML style brackets
102
- elsif token = s.scan(/<[^\s<>][^<>]*>/)
103
- extract_sgml_tokens(token).each { |t| tokens << t }
104
-
105
- # Common programming punctuation
106
- elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
107
- tokens << token
108
-
109
- # Regular token
110
- elsif token = s.scan(/[\w\.@#\/\*]+/)
111
- tokens << token
112
-
113
- # Common operators
114
- elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
115
- tokens << token
116
-
117
- else
118
- s.getch
119
- end
120
- end
121
-
122
- tokens
123
- end
124
-
125
- # Internal: Extract normalized shebang command token.
126
- #
127
- # Examples
128
- #
129
- # extract_shebang("#!/usr/bin/ruby")
130
- # # => "ruby"
131
- #
132
- # extract_shebang("#!/usr/bin/env node")
133
- # # => "node"
134
- #
135
- # extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
136
- # # => "awk"
137
- #
138
- # Returns String token or nil it couldn't be parsed.
139
- def extract_shebang(data)
140
- s = StringScanner.new(data)
141
-
142
- if path = s.scan(/^#!\s*\S+/)
143
- script = path.split('/').last
144
- if script == 'env'
145
- s.scan(/\s+/)
146
- s.scan(/.*=[^\s]+\s+/)
147
- script = s.scan(/\S+/)
148
- end
149
- script = script[/[^\d]+/, 0] if script
150
- return script
151
- end
152
-
153
- nil
154
- end
155
-
156
- # Internal: Extract tokens from inside SGML tag.
157
- #
158
- # data - SGML tag String.
159
- #
160
- # Examples
161
- #
162
- # extract_sgml_tokens("<a href='' class=foo>")
163
- # # => ["<a>", "href="]
164
- #
165
- # Returns Array of token Strings.
166
- def extract_sgml_tokens(data)
167
- s = StringScanner.new(data)
168
-
169
- tokens = []
170
-
171
- until s.eos?
172
- # Emit start token
173
- if token = s.scan(/<\/?[^\s>]+/)
174
- tokens << "#{token}>"
175
-
176
- # Emit attributes with trailing =
177
- elsif token = s.scan(/\w+=/)
178
- tokens << token
179
-
180
- # Then skip over attribute value
181
- if s.scan(/"/)
182
- s.skip_until(/[^\\]"/)
183
- elsif s.scan(/'/)
184
- s.skip_until(/[^\\]'/)
185
- else
186
- s.skip_until(/\w+/)
187
- end
188
-
189
- # Emit lone attributes
190
- elsif token = s.scan(/\w+/)
191
- tokens << token
192
-
193
- # Stop at the end of the tag
194
- elsif s.scan(/>/)
195
- s.terminate
196
-
197
- else
198
- s.getch
199
- end
200
- end
201
-
202
- tokens
203
- end
204
19
  end
205
20
  end
@@ -1,3 +1,3 @@
1
1
  module Linguist
2
- VERSION = "5.3.1"
2
+ VERSION = "5.3.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: github-linguist
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.3.1
4
+ version: 5.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - GitHub
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-17 00:00:00.000000000 Z
11
+ date: 2017-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: charlock_holmes
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '5.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake-compiler
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.9'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.9'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: mocha
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -199,12 +213,19 @@ email:
199
213
  executables:
200
214
  - linguist
201
215
  - git-linguist
202
- extensions: []
216
+ extensions:
217
+ - ext/linguist/extconf.rb
203
218
  extra_rdoc_files: []
204
219
  files:
205
220
  - LICENSE
206
221
  - bin/git-linguist
207
222
  - bin/linguist
223
+ - ext/linguist/extconf.rb
224
+ - ext/linguist/lex.linguist_yy.c
225
+ - ext/linguist/lex.linguist_yy.h
226
+ - ext/linguist/linguist.c
227
+ - ext/linguist/linguist.h
228
+ - ext/linguist/tokenizer.l
208
229
  - grammars/annotation.liquidhaskell.haskell.json
209
230
  - grammars/config.xcompose.json
210
231
  - grammars/file.lasso.json
@@ -651,6 +672,7 @@ files:
651
672
  - lib/linguist/languages.json
652
673
  - lib/linguist/languages.yml
653
674
  - lib/linguist/lazy_blob.rb
675
+ - lib/linguist/linguist.bundle
654
676
  - lib/linguist/md5.rb
655
677
  - lib/linguist/popular.yml
656
678
  - lib/linguist/repository.rb