tongue 0.2.10.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ begin
2
+ require 'json'
3
+ rescue LoadError
4
+ require 'yaml'
5
+ end
6
+
7
+ require 'linguist/md5'
8
+ require 'linguist/classifier'
9
+
10
module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object. Only defined when the serialized
    # db exists on disk. JSON is used when it has been loaded, YAML otherwise.
    if File.exist?(PATH)
      serializer = defined?(JSON) ? JSON : YAML
      DATA = serializer.load(File.read(PATH))
    end

    # Public: Iterate over each sample.
    #
    # Every directory directly under ROOT is treated as a language name
    # (except 'Text' and 'Binary', which are skipped). Files inside a
    # language's `filenames/` subdirectory are yielded with a :filename key;
    # all other files must have an extension and are yielded with :extname
    # and :interpreter keys.
    #
    # &block - Yields a sample Hash with :path and :language, plus either
    #          :filename or :extname/:interpreter.
    #
    # Returns nothing when a block is given, otherwise an Enumerator.
    # Raises RuntimeError if a sample outside `filenames/` has no extension.
    def self.each(&block)
      return enum_for(:each) unless block_given?

      Dir.entries(ROOT).each do |category|
        next if category == '.' || category == '..'

        # Skip text and binary for now
        # Possibly reconsider this later
        next if category == 'Text' || category == 'Binary'

        dirname = File.join(ROOT, category)
        Dir.entries(dirname).each do |filename|
          next if filename == '.' || filename == '..'

          path = File.join(dirname, filename)

          if filename == 'filenames'
            Dir.entries(path).each do |subfilename|
              next if subfilename == '.' || subfilename == '..'

              yield({
                :path     => File.join(path, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            if File.extname(filename) == ""
              raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            # The shebang must be read from the sample's full path; the bare
            # filename would be resolved against the current working
            # directory and the interpreter would never be detected.
            interpreter = File.file?(path) ? Linguist.interpreter_from_shebang(File.read(path)) : nil

            yield({
              :path        => path,
              :language    => category,
              :interpreter => interpreter,
              :extname     => File.extname(filename)
            })
          end
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Collects, per language, the sorted lists of sample extensions,
    # interpreters and filenames, trains the Classifier on every sample's
    # contents, and finally stores a digest of the db under 'md5'.
    #
    # Returns the trained db Hash.
    def self.data
      db = {}
      db['extnames'] = {}
      db['interpreters'] = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        # Extensions and interpreters are recorded at most once per language.
        [['extnames', :extname], ['interpreters', :interpreter]].each do |section, key|
          value = sample[key]
          next unless value

          known = (db[section][language_name] ||= [])
          next if known.include?(value)

          known << value
          known.sort!
        end

        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
          db['filenames'][language_name].sort!
        end

        data = File.read(sample[:path])
        Classifier.train!(db, language_name, data)
      end

      db['md5'] = Linguist::MD5.hexdigest(db)

      db
    end
  end

  # Used to retrieve the interpreter from the shebang line of a file's
  # data.
  #
  # data - String contents of the file.
  #
  # Examples
  #
  #   interpreter_from_shebang("#!/usr/bin/env python2.6\n")
  #   # => "python"
  #
  # Returns the interpreter name String, or nil when the first line is not
  # a usable shebang.
  def self.interpreter_from_shebang(data)
    # Only the first five lines are ever inspected.
    lines = data.lines.first(5)
    bang  = lines.first

    # An invalidly encoded line cannot be matched against a Regexp, so it is
    # treated the same as a missing shebang.
    return nil unless bang && bang.valid_encoding? && bang.start_with?('#!')

    tokens = bang.sub(/\A#!\s*/, '').split(' ')
    return nil if tokens.empty?

    script = tokens.first.split('/').last
    script = tokens[1] if script == 'env'
    return nil if script.nil? || script.empty?

    # "python2.6" -> "python"
    script = script.sub(/(?:\d+\.?)+/, '')

    # Check for multiline shebang hacks that call `exec`
    if script == 'sh'
      lines.each do |line|
        next unless line.valid_encoding?

        if (match = line.match(/exec (\w+).+\$0.+\$@/))
          script = match[1]
          break
        end
      end
    end

    script.empty? ? nil : script
  end

end
@@ -0,0 +1,198 @@
1
+ require 'strscan'
2
+
3
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # TeX
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)'],    # Coq
      ['"""', '"""']   # Python
    ]

    # NOTE: inside a double-quoted String "\s" is a literal space, so each
    # alternative is "zero or more spaces, the marker, one space".
    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Opening comment token => Regexp matching its closing token. Compiled
    # once so the scanner does not rebuild a Regexp for every comment.
    END_MULTI_LINE_COMMENT = MULTI_LINE_COMMENTS.each_with_object({}) { |(opener, closer), map|
      map[opener] = Regexp.compile(Regexp.escape(closer))
    }

    # Quote character => Regexp matching the next unescaped closing quote
    # (a quote preceded by any character other than a backslash).
    END_QUOTED_STRING = {
      '"' => /[^\\]"/,
      "'" => /[^\\]'/
    }

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Stop once the scan position passes the byte limit.
        break if s.pos >= BYTE_LIMIT

        if (token = s.scan(/^#!.+$/))
          if (name = extract_shebang(token))
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && (token = s.scan(START_SINGLE_LINE_COMMENT))
          s.skip_until(/\n|\Z/)

        # Multiline comments. An unterminated comment only drops its opener.
        elsif (token = s.scan(START_MULTI_LINE_COMMENT))
          s.skip_until(END_MULTI_LINE_COMMENT[token])

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          skip_quoted(s, '"')
        elsif s.scan(/'/)
          skip_quoted(s, "'")

        # Skip number literals
        # NOTE: hex digits after "0x" are not consumed by this pattern.
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif (token = s.scan(/<[^\s<>][^<>]*>/))
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif (token = s.scan(/;|\{|\}|\(|\)|\[|\]/))
          tokens << token

        # Regular token
        elsif (token = s.scan(/[\w\.@#\/\*]+/))
          tokens << token

        # Common operators
        elsif (token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/))
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # data - String shebang line.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    #   extract_shebang("#!ruby")
    #   # => "ruby"
    #
    # Returns String token or nil it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if (path = s.scan(/^#!\s*\S+/))
        # Drop the "#!" marker first so a command given without any
        # directory part does not keep it as part of its name.
        script = path.sub(/\A#!\s*/, '').split('/').last
        if script == 'env'
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        # "python2.6" -> "python"
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if (token = s.scan(/<\/?[^\s>]+/))
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif (token = s.scan(/\w+=/))
          tokens << token

          # Then skip over attribute value. Empty quoted values are handled
          # the same way extract_tokens handles empty strings, so the scan
          # does not run on into the next attribute.
          if s.scan(/"/)
            skip_quoted(s, '"')
          elsif s.scan(/'/)
            skip_quoted(s, "'")
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif (token = s.scan(/\w+/))
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end

    private

    # Internal: Advance the scanner past a quoted string body whose opening
    # quote has already been consumed.
    #
    # s     - StringScanner positioned right after the opening quote.
    # quote - The quote character String, '"' or "'".
    #
    # Returns nothing meaningful.
    def skip_quoted(s, quote)
      if s.peek(1) == quote
        # Empty string: just consume the closing quote.
        s.getch
      else
        s.skip_until(END_QUOTED_STRING[quote])
      end
    end
  end
end
@@ -0,0 +1,167 @@
1
+ # Vendored files and directories are excluded from language
2
+ # statistics.
3
+ #
4
+ # Lines in this file are Regexps that are matched against the file
5
+ # pathname.
6
+ #
7
+ # Please add additional test coverage to
8
+ # `test/test_blob.rb#test_vendored` if you make any changes.
9
+
10
+ ## Vendor Conventions ##
11
+
12
+ # Caches
13
+ - cache/
14
+
15
+ # Dependencies
16
+ - ^[Dd]ependencies/
17
+
18
+ # C deps
19
+ # https://github.com/joyent/node
20
+ - ^deps/
21
+ - ^tools/
22
+ - (^|/)configure$
23
+ - (^|/)configure.ac$
24
+ - (^|/)config.guess$
25
+ - (^|/)config.sub$
26
+
27
+ # Node dependencies
28
+ - node_modules/
29
+
30
+ # Bower Components
31
+ - bower_components/
32
+
33
+ # Erlang bundles
34
+ - ^rebar$
35
+
36
+ # Bootstrap minified css and js
37
+ - (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
38
+
39
+ # Vendored dependencies
40
+ - thirdparty/
41
+ - vendors?/
42
+
43
+ # Debian packaging
44
+ - ^debian/
45
+
46
+ ## Commonly Bundled JavaScript frameworks ##
47
+
48
+ # jQuery
49
+ - (^|/)jquery([^.]*)(\.min)?\.js$
50
+ - (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$
51
+
52
+ # jQuery UI
53
+ - (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
54
+ - (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$
55
+
56
+ # Prototype
57
+ - (^|/)prototype(.*)\.js$
58
+ - (^|/)effects\.js$
59
+ - (^|/)controls\.js$
60
+ - (^|/)dragdrop\.js$
61
+
62
+ # MooTools
63
+ - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$
64
+
65
+ # Dojo
66
+ - (^|/)dojo\.js$
67
+
68
+ # MochiKit
69
+ - (^|/)MochiKit\.js$
70
+
71
+ # YUI
72
+ - (^|/)yahoo-([^.]*)\.js$
73
+ - (^|/)yui([^.]*)\.js$
74
+
75
+ # WYSIWYG editors
76
+ - (^|/)ckeditor\.js$
77
+ - (^|/)tiny_mce([^.]*)\.js$
78
+ - (^|/)tiny_mce/(langs|plugins|themes|utils)
79
+
80
+ # MathJax
81
+ - (^|/)MathJax/
82
+
83
+ # SyntaxHighlighter - http://alexgorbatchev.com/
84
+ - (^|/)shBrush([^.]*)\.js$
85
+ - (^|/)shCore\.js$
86
+ - (^|/)shLegacy\.js$
87
+
88
+ # AngularJS
89
+ - (^|/)angular([^.]*)(\.min)?\.js$
90
+
91
+ ## Python ##
92
+
93
+ # django
94
+ - (^|/)admin_media/
95
+
96
+ # Fabric
97
+ - ^fabfile\.py$
98
+
99
+ # WAF
100
+ - ^waf$
101
+
102
+ # .osx
103
+ - ^.osx$
104
+
105
+ ## Obj-C ##
106
+
107
+ # Sparkle
108
+ - (^|/)Sparkle/
109
+
110
+ ## .NET ##
111
+
112
+ # Visual Studio IntelliSense
113
+ - -vsdoc\.js$
114
+
115
+ # jQuery validation plugin (MS bundles this with asp.net mvc)
116
+ - (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
117
+ - (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$
118
+
119
+ # Microsoft Ajax
120
+ - (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$
121
+
122
+ # NuGet
123
+ - ^[Pp]ackages/
124
+
125
+ # ExtJS
126
+ - (^|/)extjs/.*?\.js$
127
+ - (^|/)extjs/.*?\.xml$
128
+ - (^|/)extjs/.*?\.txt$
129
+ - (^|/)extjs/.*?\.html$
130
+ - (^|/)extjs/.*?\.properties$
131
+ - (^|/)extjs/.sencha/
132
+ - (^|/)extjs/docs/
133
+ - (^|/)extjs/builds/
134
+ - (^|/)extjs/cmd/
135
+ - (^|/)extjs/examples/
136
+ - (^|/)extjs/locale/
137
+ - (^|/)extjs/packages/
138
+ - (^|/)extjs/plugins/
139
+ - (^|/)extjs/resources/
140
+ - (^|/)extjs/src/
141
+ - (^|/)extjs/welcome/
142
+
143
+ # Samples folders
144
+ - ^[Ss]amples/
145
+
146
+ # LICENSE, README, git config files
147
+ - ^COPYING$
148
+ - LICENSE$
149
+ - License$
150
+ - gitattributes$
151
+ - gitignore$
152
+ - gitmodules$
153
+ - ^README$
154
+ - ^readme$
155
+
156
+ # Test fixtures
157
+ - ^[Tt]est/fixtures/
158
+
159
+ # PhoneGap/Cordova
160
+ - (^|/)cordova([^.]*)(\.min)?\.js$
161
+ - (^|/)cordova\-\d\.\d(\.\d)?(\.min)?\.js$
162
+
163
+ # Vagrant
164
+ - ^Vagrantfile$
165
+
166
+ # .DS_Store files
167
+ - .[Dd][Ss]_[Ss]tore$