docuvator 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/docuvator +9 -0
- data/lib/docuvator.rb +44 -0
- data/lib/docuvator/initialize.rb +5 -0
- data/lib/docuvator/log.rb +59 -0
- data/lib/docuvator/splitter.rb +71 -0
- data/lib/docuvator/version.rb +3 -0
- data/spec/log_spec.rb +54 -0
- data/spec/splitter_spec.rb +47 -0
- metadata +56 -0
data/bin/docuvator
ADDED
data/lib/docuvator.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'docuvator/initialize'
|
2
|
+
|
3
|
+
module Docuvator
  # Command-line front end: parses the switches, configures the log level
  # and hands files over to Splitter.
  class CLI
    # OpenStruct holding the parsed switches (+verbose+ and +debug+).
    attr_reader :options

    # Builds the default options (both switches off), then parses the
    # command line and applies the resulting log level.
    def initialize
      @options = OpenStruct.new(verbose: false, debug: false)
      parse_options
      determine_log_level
    end

    # Logs the file name at INFO level and splits the file.
    #
    # @param file [String] path of the document to process
    # @return whatever Splitter#split returns
    def process(file)
      Log.info "Processing #{file}"
      splitter = Splitter.new(file)
      splitter.split
    end

    # Declares the -v/--verbose and -d/--debug switches and parses the
    # command line destructively (parse! without arguments works on
    # OptionParser's default argument list).
    def parse_options
      parser = OptionParser.new
      parser.version = VERSION
      parser.on("-v", "--verbose", "Verbose output (shows INFO level log statements)") do
        options.verbose = true
      end
      parser.on("-d", "--debug", "Debug output (shows DEBUG level log statements)") do
        options.debug = true
      end
      parser.parse!
    end

    private

    # Maps the parsed switches onto the logger. Debug takes precedence over
    # verbose; with neither switch the log level is left untouched.
    def determine_log_level
      if options.debug
        Log.level = Logger::DEBUG
        return Log.use_debug
      end

      Log.level = Logger::INFO if options.verbose
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'singleton'
|
3
|
+
|
4
|
+
module Docuvator
  # Singleton facade around a stdlib Logger writing to STDOUT.
  #
  # Any class-level call that the wrapped Logger responds to (Log.info,
  # Log.level=, Log.progname, ...) is forwarded to it via method_missing.
  class Log
    include Singleton

    # Leading "file:line" part of a Kernel#caller entry. Anchored with \A so
    # only the very start of the string is considered.
    CALLER_PATTERN = /\A(?<file>.+?):(?<line>\d+)/

    attr_accessor :logger, :base_directory, :debugging

    # Sets up a logger that prints bare messages and only shows ERROR and
    # above until a caller changes the level.
    def initialize
      # Directory two levels above this file; stripped from caller paths.
      @base_directory = File.expand_path("../..", __FILE__) + "/"
      @debugging = false
      @logger = Logger.new(STDOUT)
      @logger.level = Logger::ERROR
      @logger.formatter = proc do |_sev, _datetime, _progname, msg|
        "#{msg}\n"
      end
    end

    # Switches to the debug output format, which prefixes every message with
    # its severity and the progname (set to the caller location by
    # method_missing while debugging is on).
    def self.use_debug
      instance.debugging = true
      instance.logger.formatter = proc do |sev, _datetime, progname, msg|
        "#{sev} [#{progname}]: #{msg}\n"
      end
    end

    # Determine the file and line number of the caller.
    #
    # @param message [String, nil] one entry of Kernel#caller
    # @return [String, nil] "file:line" with the base directory removed from
    #   the file part, or nil when the entry does not start with "file:line"
    def self.parse_caller(message)
      match = CALLER_PATTERN.match(message.to_s)
      return unless match

      "#{match[:file].sub(instance.base_directory, "")}:#{match[:line]}"
    end

    # Forwards every method the wrapped logger understands; anything else
    # falls through to the default NoMethodError.
    def self.method_missing(method, *args, &blk)
      if valid_method? method
        # caller(1) skips this frame, so the first entry is the code that
        # invoked Log.<method>.
        instance.logger.progname = parse_caller(caller(1).first) if instance.debugging
        instance.logger.send(method, *args, &blk)
      else
        super
      end
    end

    # Keeps respond_to? consistent with method_missing.
    def self.respond_to_missing?(method, include_all=false)
      valid_method?(method) || super
    end

    # @return [Boolean] whether the wrapped logger responds to +method+
    def self.valid_method?(method)
      instance.logger.respond_to? method
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Docuvator
  # Breaks a text file into paragraphs, sentences and a word-frequency table.
  class Splitter
    attr_accessor :paragraphs, :sentences, :word_frequency
    attr_reader :file

    # Sentence ignore set: abbreviations whose trailing "." does not end a
    # sentence.
    IGNORE_SET = ['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr'].freeze

    REGEX_NEWLINE = /\n+/
    REGEX_SPACES = /\s+/
    REGEX_PUNCTUATION = /[.!?,:;\[\]\(\)]+/

    # Regex to split up sentences - http://stackoverflow.com/a/5844564/583592
    #
    #   (?<=            # Begin positive lookbehind.
    #     [.!?]         # Either an end of sentence punct,
    #   | [.!?][\'"]    # or end of sentence punct and quote.
    #   )               # End positive lookbehind.
    #   (?<!            # Begin negative lookbehind.
    #     Mr\.          # Skip either "Mr."
    #   | Mrs\.         # or "Mrs.",
    #   | Ms\.          # or "Ms.",
    #   | Jr\.          # or "Jr.",
    #   | Dr\.          # or "Dr.",
    #   | Prof\.        # or "Prof.",
    #   | Sr\.          # or "Sr.",
    #   | \s[A-Z]\.     # or initials ex: "George W. Bush",
    #   )               # End negative lookbehind.
    #   \s+             # Split on whitespace between sentences.
    #   /ix
    REGEX_SENTENCES = Regexp.new(/(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix)

    # @param file [String] path of the text file to split
    #
    # Logs an error and terminates the process with status 1 (raising
    # SystemExit) when the file does not exist.
    def initialize(file)
      if File.exist? file
        @file = file
      else
        Log.error "Cannot split #{file} as it does not exist"
        exit 1
      end
    end

    # Reads the file and (re)populates +paragraphs+, +sentences+ and
    # +word_frequency+. All three are rebuilt from scratch on every call, so
    # calling it repeatedly yields the same result.
    def split
      # Split up text into paragraphs. File.read closes the file handle,
      # and lines containing only whitespace are not paragraphs.
      @paragraphs = File.read(@file)
                        .split(REGEX_NEWLINE)
                        .map { |text| text.gsub(REGEX_SPACES, ' ').strip }
                        .reject(&:empty?)
      Log.info "Number of paragraphs: #{@paragraphs.size}"

      # Split up paragraphs into sentences
      @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
      Log.info "Number of sentences: #{@sentences.size}"

      # Count lower-cased words with punctuation removed. Tokens that were
      # nothing but punctuation collapse to "" and are not words.
      @word_frequency = Hash.new(0)
      @sentences.each do |sentence|
        sentence.split.each do |token|
          word = token.gsub(REGEX_PUNCTUATION, '').downcase
          @word_frequency[word] += 1 unless word.empty?
        end
      end
      Log.info "Unique words: #{@word_frequency.size}"
    end
  end
end
|
data/spec/log_spec.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include Docuvator
|
4
|
+
|
5
|
+
describe Log do

  # Anonymous subclass of Log, created per example — presumably so the
  # examples do not touch the real Log's singleton state (verify against
  # Singleton's subclass behavior).
  let(:log) { Class.new(Log) }

  context "initializes instance" do
    it "should acts as singleton" do
      log.instance.should == log.instance
    end

    it "should have a logger" do
      log.instance.logger.should be_a Logger
    end

    it "should be a Log (FakeLog)" do
      # Comparing log.class with Log.class only compares Class with Class and
      # can never fail; check the inheritance relationship instead.
      log.superclass.should eq Log
    end

    it "should react to Logger methods" do
      Logger.public_instance_methods.each do |method|
        log.valid_method?(method).should be_true
      end
    end
  end

  context "#use_debug" do
    it "logger's progname before" do
      log.progname.should be_nil
    end

    it "logger's progname after" do
      log.use_debug
      log.progname.should_not be_nil
    end
  end

  context "#parse_caller" do
    context "with nothing" do
      it { log.parse_caller(nil).should be_nil }
    end

    context "with jumble (random text)" do
      it { log.parse_caller("asdaacsdc").should be_nil }
    end

    context "with valid caller" do
      it { log.parse_caller("docuvator/lib/docuvator.rb:45:in `respond_to_missing?'").should eq "docuvator/lib/docuvator.rb:45" }
    end
  end

end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include Docuvator
|
4
|
+
|
5
|
+
describe Splitter do
  # Default subject: a splitter built from whatever +file+ the context defines.
  let(:splitter) { Splitter.new(file) }

  context "initializes instance" do
    context "with valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }

      it { splitter.file.should eq file }
    end

    context "with invalid file" do
      let(:file) { fixture('invalid.txt') }

      # Building the splitter is what must raise SystemExit.
      it { -> { splitter }.should raise_error SystemExit }
    end
  end

  describe "#split" do
    context "valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }
      # Splitter that has already been split once.
      let(:splitter) { Splitter.new(file).tap { |instance| instance.split } }

      it { splitter.paragraphs.size.should eq 3 }
      it { splitter.sentences.size.should eq 39 }
      it { splitter.word_frequency.size.should eq 124 }
    end

    context "valid file (idempotence)" do
      let(:file) { fixture('lorem_ipsum.txt') }
      # Splitting twice must give the same counts as splitting once.
      let(:splitter) do
        Splitter.new(file).tap do |instance|
          instance.split
          instance.split
        end
      end

      it { splitter.paragraphs.size.should eq 3 }
      it { splitter.sentences.size.should eq 39 }
      it { splitter.word_frequency.size.should eq 124 }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: docuvator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Kevin Jalbert
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-07-13 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Gem which extracts document statistics and information
|
15
|
+
email:
|
16
|
+
- kevin.j.jalbert@gmail.com
|
17
|
+
executables:
|
18
|
+
- docuvator
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/docuvator/initialize.rb
|
23
|
+
- lib/docuvator/log.rb
|
24
|
+
- lib/docuvator/splitter.rb
|
25
|
+
- lib/docuvator/version.rb
|
26
|
+
- lib/docuvator.rb
|
27
|
+
- spec/log_spec.rb
|
28
|
+
- spec/splitter_spec.rb
|
29
|
+
- bin/docuvator
|
30
|
+
homepage: https://github.com/kevinjalbert/docuvator
|
31
|
+
licenses: []
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
none: false
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
requirements: []
|
49
|
+
rubyforge_project:
|
50
|
+
rubygems_version: 1.8.25
|
51
|
+
signing_key:
|
52
|
+
specification_version: 3
|
53
|
+
summary: Gem which extracts document statistics and information
|
54
|
+
test_files:
|
55
|
+
- spec/log_spec.rb
|
56
|
+
- spec/splitter_spec.rb
|