tongue 0.2.10.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,149 @@
1
+ begin
2
+ require 'json'
3
+ rescue LoadError
4
+ require 'yaml'
5
+ end
6
+
7
+ require 'linguist/md5'
8
+ require 'linguist/classifier'
9
+
10
module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object.
    # Uses JSON when available (required at the top of this file, with a
    # YAML fallback on LoadError).
    # NOTE(review): DATA is only defined when the serialized db exists on
    # disk — readers must guard with `defined?(DATA)` or similar.
    if File.exist?(PATH)
      serializer = defined?(JSON) ? JSON : YAML
      DATA = serializer.load(File.read(PATH))
    end
24
+
25
+ # Public: Iterate over each sample.
26
+ #
27
+ # &block - Yields Sample to block
28
+ #
29
+ # Returns nothing.
30
+ def self.each(&block)
31
+ Dir.entries(ROOT).each do |category|
32
+ next if category == '.' || category == '..'
33
+
34
+ # Skip text and binary for now
35
+ # Possibly reconsider this later
36
+ next if category == 'Text' || category == 'Binary'
37
+
38
+ dirname = File.join(ROOT, category)
39
+ Dir.entries(dirname).each do |filename|
40
+ next if filename == '.' || filename == '..'
41
+
42
+ if filename == 'filenames'
43
+ Dir.entries(File.join(dirname, filename)).each do |subfilename|
44
+ next if subfilename == '.' || subfilename == '..'
45
+
46
+ yield({
47
+ :path => File.join(dirname, filename, subfilename),
48
+ :language => category,
49
+ :filename => subfilename
50
+ })
51
+ end
52
+ else
53
+ if File.extname(filename) == ""
54
+ raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
55
+ end
56
+
57
+ yield({
58
+ :path => File.join(dirname, filename),
59
+ :language => category,
60
+ :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
61
+ :extname => File.extname(filename)
62
+ })
63
+ end
64
+ end
65
+ end
66
+
67
+ nil
68
+ end
69
+
70
+ # Public: Build Classifier from all samples.
71
+ #
72
+ # Returns trained Classifier.
73
+ def self.data
74
+ db = {}
75
+ db['extnames'] = {}
76
+ db['interpreters'] = {}
77
+ db['filenames'] = {}
78
+
79
+ each do |sample|
80
+ language_name = sample[:language]
81
+
82
+ if sample[:extname]
83
+ db['extnames'][language_name] ||= []
84
+ if !db['extnames'][language_name].include?(sample[:extname])
85
+ db['extnames'][language_name] << sample[:extname]
86
+ db['extnames'][language_name].sort!
87
+ end
88
+ end
89
+
90
+ if sample[:interpreter]
91
+ db['interpreters'][language_name] ||= []
92
+ if !db['interpreters'][language_name].include?(sample[:interpreter])
93
+ db['interpreters'][language_name] << sample[:interpreter]
94
+ db['interpreters'][language_name].sort!
95
+ end
96
+ end
97
+
98
+ if sample[:filename]
99
+ db['filenames'][language_name] ||= []
100
+ db['filenames'][language_name] << sample[:filename]
101
+ db['filenames'][language_name].sort!
102
+ end
103
+
104
+ data = File.read(sample[:path])
105
+ Classifier.train!(db, language_name, data)
106
+ end
107
+
108
+ db['md5'] = Linguist::MD5.hexdigest(db)
109
+
110
+ db
111
+ end
112
+ end
113
+
114
+ # Used to retrieve the interpreter from the shebang line of a file's
115
+ # data.
116
+ def self.interpreter_from_shebang(data)
117
+ lines = data.lines.to_a
118
+
119
+ if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
120
+ bang.sub!(/^#! /, '#!')
121
+ tokens = bang.split(' ')
122
+ pieces = tokens.first.split('/')
123
+
124
+ if pieces.size > 1
125
+ script = pieces.last
126
+ else
127
+ script = pieces.first.sub('#!', '')
128
+ end
129
+
130
+ script = script == 'env' ? tokens[1] : script
131
+
132
+ # "python2.6" -> "python"
133
+ if script =~ /((?:\d+\.?)+)/
134
+ script.sub! $1, ''
135
+ end
136
+
137
+ # Check for multiline shebang hacks that call `exec`
138
+ if script == 'sh' &&
139
+ lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
140
+ script = $1
141
+ end
142
+
143
+ script
144
+ else
145
+ nil
146
+ end
147
+ end
148
+
149
+ end
@@ -0,0 +1,198 @@
1
+ require 'strscan'
2
+
3
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)'],    # Coq
      ['"""', '"""']   # Python
    ]

    # A comment marker may be preceded by whitespace and must be followed
    # by a space.
    #
    # BUG FIX: this was built with "\s*" in a double-quoted string, and
    # Ruby's "\s" string escape is a literal space character — so the
    # regexp matched only space indentation and tab-indented comments
    # were tokenized as code. "\\s*" delivers the whitespace class to
    # the regexp engine.
    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Cap work on very large blobs.
        break if s.pos >= BYTE_LIMIT

        # Shebang line: emit a normalized "SHEBANG#!<name>" token.
        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment: only recognized at the start of a line,
        # skipped through the newline without emitting tokens.
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          s.skip_until(/\n|\Z/)

        # Multiline comments: skip through the matching close token.
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    # Returns String token or nil it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        script = path.split('/').last
        # "#!/usr/bin/env ruby" — the interpreter is env's argument.
        if script == 'env'
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        # Keep the leading non-digit run: "python2.6" -> "python"
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
@@ -0,0 +1,167 @@
1
+ # Vendored files and directories are excluded from language
2
+ # statistics.
3
+ #
4
+ # Lines in this file are Regexps that are matched against the file
5
+ # pathname.
6
+ #
7
+ # Please add additional test coverage to
8
+ # `test/test_blob.rb#test_vendored` if you make any changes.
9
+
10
+ ## Vendor Conventions ##
11
+
12
+ # Caches
13
+ - cache/
14
+
15
+ # Dependencies
16
+ - ^[Dd]ependencies/
17
+
18
+ # C deps
19
+ # https://github.com/joyent/node
20
+ - ^deps/
21
+ - ^tools/
22
+ - (^|/)configure$
23
+ - (^|/)configure.ac$
24
+ - (^|/)config.guess$
25
+ - (^|/)config.sub$
26
+
27
+ # Node dependencies
28
+ - node_modules/
29
+
30
+ # Bower Components
31
+ - bower_components/
32
+
33
+ # Erlang bundles
34
+ - ^rebar$
35
+
36
+ # Bootstrap minified css and js
37
+ - (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
38
+
39
+ # Vendored dependencies
40
+ - thirdparty/
41
+ - vendors?/
42
+
43
+ # Debian packaging
44
+ - ^debian/
45
+
46
+ ## Commonly Bundled JavaScript frameworks ##
47
+
48
+ # jQuery
49
+ - (^|/)jquery([^.]*)(\.min)?\.js$
50
+ - (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$
51
+
52
+ # jQuery UI
53
+ - (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
54
+ - (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$
55
+
56
+ # Prototype
57
+ - (^|/)prototype(.*)\.js$
58
+ - (^|/)effects\.js$
59
+ - (^|/)controls\.js$
60
+ - (^|/)dragdrop\.js$
61
+
62
+ # MooTools
63
+ - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$
64
+
65
+ # Dojo
66
+ - (^|/)dojo\.js$
67
+
68
+ # MochiKit
69
+ - (^|/)MochiKit\.js$
70
+
71
+ # YUI
72
+ - (^|/)yahoo-([^.]*)\.js$
73
+ - (^|/)yui([^.]*)\.js$
74
+
75
+ # WYSIWYG editors
76
+ - (^|/)ckeditor\.js$
77
+ - (^|/)tiny_mce([^.]*)\.js$
78
+ - (^|/)tiny_mce/(langs|plugins|themes|utils)
79
+
80
+ # MathJax
81
+ - (^|/)MathJax/
82
+
83
+ # SyntaxHighlighter - http://alexgorbatchev.com/
84
+ - (^|/)shBrush([^.]*)\.js$
85
+ - (^|/)shCore\.js$
86
+ - (^|/)shLegacy\.js$
87
+
88
+ # AngularJS
89
+ - (^|/)angular([^.]*)(\.min)?\.js$
90
+
91
+ ## Python ##
92
+
93
+ # django
94
+ - (^|/)admin_media/
95
+
96
+ # Fabric
97
+ - ^fabfile\.py$
98
+
99
+ # WAF
100
+ - ^waf$
101
+
102
+ # .osx
103
+ - ^.osx$
104
+
105
+ ## Obj-C ##
106
+
107
+ # Sparkle
108
+ - (^|/)Sparkle/
109
+
110
+ ## .NET ##
111
+
112
+ # Visual Studio IntelliSense
113
+ - -vsdoc\.js$
114
+
115
+ # jQuery validation plugin (MS bundles this with asp.net mvc)
116
+ - (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
117
+ - (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$
118
+
119
+ # Microsoft Ajax
120
+ - (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$
121
+
122
+ # NuGet
123
+ - ^[Pp]ackages/
124
+
125
+ # ExtJS
126
+ - (^|/)extjs/.*?\.js$
127
+ - (^|/)extjs/.*?\.xml$
128
+ - (^|/)extjs/.*?\.txt$
129
+ - (^|/)extjs/.*?\.html$
130
+ - (^|/)extjs/.*?\.properties$
131
+ - (^|/)extjs/.sencha/
132
+ - (^|/)extjs/docs/
133
+ - (^|/)extjs/builds/
134
+ - (^|/)extjs/cmd/
135
+ - (^|/)extjs/examples/
136
+ - (^|/)extjs/locale/
137
+ - (^|/)extjs/packages/
138
+ - (^|/)extjs/plugins/
139
+ - (^|/)extjs/resources/
140
+ - (^|/)extjs/src/
141
+ - (^|/)extjs/welcome/
142
+
143
+ # Samples folders
144
+ - ^[Ss]amples/
145
+
146
+ # LICENSE, README, git config files
147
+ - ^COPYING$
148
+ - LICENSE$
149
+ - License$
150
+ - gitattributes$
151
+ - gitignore$
152
+ - gitmodules$
153
+ - ^README$
154
+ - ^readme$
155
+
156
+ # Test fixtures
157
+ - ^[Tt]est/fixtures/
158
+
159
+ # PhoneGap/Cordova
160
+ - (^|/)cordova([^.]*)(\.min)?\.js$
161
+ - (^|/)cordova\-\d\.\d(\.\d)?(\.min)?\.js$
162
+
163
+ # Vagrant
164
+ - ^Vagrantfile$
165
+
166
+ # .DS_Store's
167
+ - .[Dd][Ss]_[Ss]tore$