github-linguist 2.0.1 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/linguist +1 -1
- data/lib/linguist.rb +1 -1
- data/lib/linguist/blob_helper.rb +9 -194
- data/lib/linguist/classifier.rb +50 -111
- data/lib/linguist/language.rb +31 -16
- data/lib/linguist/languages.yml +110 -121
- data/lib/linguist/md5.rb +38 -0
- data/lib/linguist/repository.rb +1 -1
- data/lib/linguist/samples.json +20125 -0
- data/lib/linguist/samples.rb +94 -0
- data/lib/linguist/tokenizer.rb +34 -44
- metadata +21 -5
- data/lib/linguist/classifier.yml +0 -19013
- data/lib/linguist/pathname.rb +0 -92
- data/lib/linguist/sample.rb +0 -74
data/lib/linguist/pathname.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
require 'linguist/language'
|
2
|
-
require 'linguist/mime'
|
3
|
-
require 'pygments'
|
4
|
-
|
5
|
-
module Linguist
|
6
|
-
# Similar to ::Pathname, Linguist::Pathname wraps a path string and
|
7
|
-
# provides helpful query methods. It's useful when you only have a
|
8
|
-
# filename but not a blob and need to figure out the language of the file.
|
9
|
-
class Pathname
|
10
|
-
# Public: Initialize a Pathname
|
11
|
-
#
|
12
|
-
# path - A filename String. The file may or may not actually exist.
|
13
|
-
#
|
14
|
-
# Returns a Pathname.
|
15
|
-
def initialize(path)
|
16
|
-
@path = path
|
17
|
-
end
|
18
|
-
|
19
|
-
# Public: Get the basename of the path
|
20
|
-
#
|
21
|
-
# Examples
|
22
|
-
#
|
23
|
-
# Pathname.new('sub/dir/file.rb').basename
|
24
|
-
# # => 'file.rb'
|
25
|
-
#
|
26
|
-
# Returns a String.
|
27
|
-
def basename
|
28
|
-
File.basename(@path)
|
29
|
-
end
|
30
|
-
|
31
|
-
# Public: Get the extname of the path
|
32
|
-
#
|
33
|
-
# Examples
|
34
|
-
#
|
35
|
-
# Pathname.new('.rb').extname
|
36
|
-
# # => '.rb'
|
37
|
-
#
|
38
|
-
# Pathname.new('file.rb').extname
|
39
|
-
# # => '.rb'
|
40
|
-
#
|
41
|
-
# Returns a String.
|
42
|
-
def extname
|
43
|
-
File.extname(@path)
|
44
|
-
end
|
45
|
-
|
46
|
-
# Public: Get the language of the path
|
47
|
-
#
|
48
|
-
# The path extension name is the only heuristic used to detect the
|
49
|
-
# language name.
|
50
|
-
#
|
51
|
-
# Examples
|
52
|
-
#
|
53
|
-
# Pathname.new('file.rb').language
|
54
|
-
# # => Language['Ruby']
|
55
|
-
#
|
56
|
-
# Returns a Language or nil if none was found.
|
57
|
-
def language
|
58
|
-
@language ||= Language.find_by_filename(@path)
|
59
|
-
end
|
60
|
-
|
61
|
-
# Internal: Get the lexer of the path
|
62
|
-
#
|
63
|
-
# Returns a Lexer.
|
64
|
-
def lexer
|
65
|
-
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
|
66
|
-
end
|
67
|
-
|
68
|
-
# Public: Get the mime type
|
69
|
-
#
|
70
|
-
# Examples
|
71
|
-
#
|
72
|
-
# Pathname.new('index.html').mime_type
|
73
|
-
# # => 'text/html'
|
74
|
-
#
|
75
|
-
# Returns a mime type String.
|
76
|
-
def mime_type
|
77
|
-
@mime_type ||= Mime.mime_for(extname)
|
78
|
-
end
|
79
|
-
|
80
|
-
# Public: Return self as String
|
81
|
-
#
|
82
|
-
# Returns a String
|
83
|
-
def to_s
|
84
|
-
@path.dup
|
85
|
-
end
|
86
|
-
|
87
|
-
def eql?(other)
|
88
|
-
other.is_a?(self.class) && @path == other.to_s
|
89
|
-
end
|
90
|
-
alias_method :==, :eql?
|
91
|
-
end
|
92
|
-
end
|
data/lib/linguist/sample.rb
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
require 'linguist/classifier'
|
2
|
-
require 'linguist/language'
|
3
|
-
|
4
|
-
module Linguist
|
5
|
-
# Model for accessing classifier training data.
|
6
|
-
class Sample
|
7
|
-
# Samples live in test/ for now, we'll eventually move them out
|
8
|
-
PATH = File.expand_path("../../../test/fixtures", __FILE__)
|
9
|
-
|
10
|
-
# Public: Iterate over each Sample.
|
11
|
-
#
|
12
|
-
# &block - Yields Sample to block
|
13
|
-
#
|
14
|
-
# Returns nothing.
|
15
|
-
def self.each(&block)
|
16
|
-
Dir.entries(PATH).each do |category|
|
17
|
-
next if category == '.' || category == '..'
|
18
|
-
|
19
|
-
# Skip text and binary for now
|
20
|
-
# Possibly reconsider this later
|
21
|
-
next if category == 'text' || category == 'binary'
|
22
|
-
|
23
|
-
# Map directory name to a Language alias
|
24
|
-
language = Linguist::Language.find_by_alias(category)
|
25
|
-
raise "No language for #{category.inspect}" unless language
|
26
|
-
|
27
|
-
dirname = File.join(PATH, category)
|
28
|
-
Dir.entries(dirname).each do |filename|
|
29
|
-
next if filename == '.' || filename == '..'
|
30
|
-
yield new(File.join(dirname, filename), language)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
nil
|
35
|
-
end
|
36
|
-
|
37
|
-
# Public: Build Classifier from all samples.
|
38
|
-
#
|
39
|
-
# Returns trained Classifier.
|
40
|
-
def self.classifier
|
41
|
-
classifier = Classifier.new
|
42
|
-
each { |sample| classifier.train(sample.language, sample.data) }
|
43
|
-
classifier.gc
|
44
|
-
end
|
45
|
-
|
46
|
-
# Internal: Initialize Sample.
|
47
|
-
#
|
48
|
-
# Samples should be initialized by Sample.each.
|
49
|
-
#
|
50
|
-
# path - String full path to file.
|
51
|
-
# language - Language of sample.
|
52
|
-
def initialize(path, language)
|
53
|
-
@path = path
|
54
|
-
@language = language
|
55
|
-
end
|
56
|
-
|
57
|
-
# Public: Get full path to file.
|
58
|
-
#
|
59
|
-
# Returns String.
|
60
|
-
attr_reader :path
|
61
|
-
|
62
|
-
# Public: Get sample language.
|
63
|
-
#
|
64
|
-
# Returns Language.
|
65
|
-
attr_reader :language
|
66
|
-
|
67
|
-
# Public: Read file contents.
|
68
|
-
#
|
69
|
-
# Returns String.
|
70
|
-
def data
|
71
|
-
File.read(path)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|