github-linguist 2.1.2 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -45,6 +45,10 @@ module Linguist
45
45
  })
46
46
  end
47
47
  else
48
+ if File.extname(filename) == ""
49
+ raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
50
+ end
51
+
48
52
  yield({
49
53
  :path => File.join(dirname, filename),
50
54
  :language => category,
@@ -68,18 +72,16 @@ module Linguist
68
72
  each do |sample|
69
73
  language_name = sample[:language]
70
74
 
71
- # TODO: For now skip empty extnames
72
- if sample[:extname] && sample[:extname] != ""
75
+ if sample[:extname]
73
76
  db['extnames'][language_name] ||= []
74
77
  if !db['extnames'][language_name].include?(sample[:extname])
75
78
  db['extnames'][language_name] << sample[:extname]
76
79
  end
77
80
  end
78
81
 
79
- # TODO: For now skip empty extnames
80
- if fn = sample[:filename]
82
+ if sample[:filename]
81
83
  db['filenames'][language_name] ||= []
82
- db['filenames'][language_name] << fn
84
+ db['filenames'][language_name] << sample[:filename]
83
85
  end
84
86
 
85
87
  data = File.read(sample[:path])
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module Linguist
2
4
  # Generic programming language tokenizer.
3
5
  #
@@ -50,8 +52,13 @@ module Linguist
50
52
 
51
53
  tokens = []
52
54
  until s.eos?
55
+ if token = s.scan(/^#!.+$/)
56
+ if name = extract_shebang(token)
57
+ tokens << "SHEBANG#!#{name}"
58
+ end
59
+
53
60
  # Single line comment
54
- if token = s.scan(START_SINGLE_LINE_COMMENT)
61
+ elsif token = s.scan(START_SINGLE_LINE_COMMENT)
55
62
  tokens << token.strip
56
63
  s.skip_until(/\n|\Z/)
57
64
 
@@ -64,19 +71,27 @@ module Linguist
64
71
 
65
72
  # Skip single or double quoted strings
66
73
  elsif s.scan(/"/)
67
- s.skip_until(/[^\\]"/)
74
+ if s.peek(1) == "\""
75
+ s.getch
76
+ else
77
+ s.skip_until(/[^\\]"/)
78
+ end
68
79
  elsif s.scan(/'/)
69
- s.skip_until(/[^\\]'/)
80
+ if s.peek(1) == "'"
81
+ s.getch
82
+ else
83
+ s.skip_until(/[^\\]'/)
84
+ end
70
85
 
71
86
  # Skip number literals
72
- elsif s.scan(/(0x)?\d+/)
87
+ elsif s.scan(/(0x)?\d(\d|\.)*/)
73
88
 
74
89
  # SGML style brackets
75
90
  elsif token = s.scan(/<[^\s<>][^<>]*>/)
76
91
  extract_sgml_tokens(token).each { |t| tokens << t }
77
92
 
78
93
  # Common programming punctuation
79
- elsif token = s.scan(/;|\{|\}|\(|\)/)
94
+ elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
80
95
  tokens << token
81
96
 
82
97
  # Regular token
@@ -95,6 +110,33 @@ module Linguist
95
110
  tokens
96
111
  end
97
112
 
113
+ # Internal: Extract normalized shebang command token.
114
+ #
115
+ # Examples
116
+ #
117
+ # extract_shebang("#!/usr/bin/ruby")
118
+ # # => "ruby"
119
+ #
120
+ # extract_shebang("#!/usr/bin/env node")
121
+ # # => "node"
122
+ #
123
+ # Returns String token or nil if it couldn't be parsed.
124
+ def extract_shebang(data)
125
+ s = StringScanner.new(data)
126
+
127
+ if path = s.scan(/^#!\s*\S+/)
128
+ script = path.split('/').last
129
+ if script == 'env'
130
+ s.scan(/\s+/)
131
+ script = s.scan(/\S+/)
132
+ end
133
+ script = script[/[^\d]+/, 0]
134
+ return script
135
+ end
136
+
137
+ nil
138
+ end
139
+
98
140
  # Internal: Extract tokens from inside SGML tag.
99
141
  #
100
142
  # data - SGML tag String.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: github-linguist
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.2
4
+ version: 2.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-23 00:00:00.000000000 Z
12
+ date: 2012-08-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: charlock_holmes
@@ -133,6 +133,7 @@ files:
133
133
  - lib/linguist/blob_helper.rb
134
134
  - lib/linguist/classifier.rb
135
135
  - lib/linguist/file_blob.rb
136
+ - lib/linguist/generated.rb
136
137
  - lib/linguist/language.rb
137
138
  - lib/linguist/languages.yml
138
139
  - lib/linguist/md5.rb