github-linguist 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/linguist/blob_helper.rb +10 -2
- data/lib/linguist/classifier.rb +183 -0
- data/lib/linguist/classifier.yml +19013 -0
- data/lib/linguist/language.rb +1 -1
- data/lib/linguist/sample.rb +74 -0
- data/lib/linguist/tokenizer.rb +157 -0
- metadata +22 -4
data/lib/linguist/sample.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'linguist/classifier'
|
2
|
+
require 'linguist/language'
|
3
|
+
|
4
|
+
module Linguist
  # Model for accessing classifier training data.
  class Sample
    # Samples live in test/ for now, we'll eventually move them out
    PATH = File.expand_path("../../../test/fixtures", __FILE__)

    # Public: Iterate over each Sample.
    #
    # &block - Yields Sample to block
    #
    # Returns nothing.
    def self.each(&block)
      Dir.entries(PATH).each do |category|
        # Ignore directory navigation entries
        next if %w[. ..].include?(category)

        # Skip text and binary for now
        # Possibly reconsider this later
        next if %w[text binary].include?(category)

        # Map directory name to a Language alias
        language = Linguist::Language.find_by_alias(category)
        raise "No language for #{category.inspect}" unless language

        dirname = File.join(PATH, category)
        Dir.entries(dirname).each do |filename|
          next if %w[. ..].include?(filename)

          yield new(File.join(dirname, filename), language)
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier.
    def self.classifier
      trained = Classifier.new
      each { |sample| trained.train(sample.language, sample.data) }
      trained.gc
    end

    # Internal: Initialize Sample.
    #
    # Samples should be initialized by Sample.each.
    #
    # path     - String full path to file.
    # language - Language of sample.
    def initialize(path, language)
      @path     = path
      @language = language
    end

    # Public: Get full path to file.
    #
    # Returns String.
    attr_reader :path

    # Public: Get sample language.
    #
    # Returns Language.
    attr_reader :language

    # Public: Read file contents.
    #
    # Returns String.
    def data
      File.read(path)
    end
  end
end
|
data/lib/linguist/tokenizer.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Initialize a Tokenizer.
    #
    # data - String data to scan.
    def initialize(data)
      @data = data
    end

    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data

    # Public: Extract tokens from data.
    #
    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Ruby single line comment
        if s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)

        # C style single line comment
        elsif s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)

        # Leading Tex or Matlab comments
        elsif s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)

        # C multiline comments
        elsif s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"

        # Haskell multiline comments
        elsif s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"

        # XML multiline comments
        elsif s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
          s.skip_until(/[^\\]'/)

        # Skip number literals. Hex literals such as 0xFF must be
        # matched explicitly: with the old /(0x)?\d+/ pattern the
        # scanner backtracked, consumed only the leading "0", and
        # leaked "xFF" out as a bogus word token.
        elsif s.scan(/0x[0-9a-fA-F]+|\d+/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github-linguist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
|
-
-
|
7
|
+
- 2
|
8
8
|
- 0
|
9
9
|
- 0
|
10
|
-
version:
|
10
|
+
version: 2.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- GitHub
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
type: :runtime
|
82
82
|
version_requirements: *id004
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: json
|
85
85
|
prerelease: false
|
86
86
|
requirement: &id005 !ruby/object:Gem::Requirement
|
87
87
|
none: false
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
version: "0"
|
95
95
|
type: :development
|
96
96
|
version_requirements: *id005
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rake
|
99
|
+
prerelease: false
|
100
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 3
|
106
|
+
segments:
|
107
|
+
- 0
|
108
|
+
version: "0"
|
109
|
+
type: :development
|
110
|
+
version_requirements: *id006
|
97
111
|
description:
|
98
112
|
email:
|
99
113
|
executables:
|
@@ -104,6 +118,8 @@ extra_rdoc_files: []
|
|
104
118
|
|
105
119
|
files:
|
106
120
|
- lib/linguist/blob_helper.rb
|
121
|
+
- lib/linguist/classifier.rb
|
122
|
+
- lib/linguist/classifier.yml
|
107
123
|
- lib/linguist/file_blob.rb
|
108
124
|
- lib/linguist/language.rb
|
109
125
|
- lib/linguist/languages.yml
|
@@ -112,6 +128,8 @@ files:
|
|
112
128
|
- lib/linguist/pathname.rb
|
113
129
|
- lib/linguist/popular.yml
|
114
130
|
- lib/linguist/repository.rb
|
131
|
+
- lib/linguist/sample.rb
|
132
|
+
- lib/linguist/tokenizer.rb
|
115
133
|
- lib/linguist/vendor.yml
|
116
134
|
- lib/linguist.rb
|
117
135
|
- bin/linguist
|