nlp_toolz 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/Guardfile +13 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +15 -0
- data/bin/nlp_toolz +92 -0
- data/lib/nlp_toolz/helpers/lang.rb +36 -0
- data/lib/nlp_toolz/helpers/string_extended.rb +20 -0
- data/lib/nlp_toolz/helpers/tmp_file.rb +18 -0
- data/lib/nlp_toolz/helpers/url_handler.rb +26 -0
- data/lib/nlp_toolz/load_jars.rb +22 -0
- data/lib/nlp_toolz/parser.rb +146 -0
- data/lib/nlp_toolz/pos_tags.rb +77 -0
- data/lib/nlp_toolz/sentences.rb +50 -0
- data/lib/nlp_toolz/tokens.rb +48 -0
- data/lib/nlp_toolz/version.rb +8 -0
- data/lib/nlp_toolz.rb +84 -0
- data/nlp_toolz.gemspec +42 -0
- data/spec/helpers/string_extended_spec.rb +17 -0
- data/spec/lib/nlp_toolz/parser_spec.rb +67 -0
- data/spec/lib/nlp_toolz/pos_tags_spec.rb +67 -0
- data/spec/lib/nlp_toolz/sentences_spec.rb +60 -0
- data/spec/lib/nlp_toolz/tokens_spec.rb +62 -0
- data/spec/lib/nlp_toolz_spec.rb +69 -0
- data/spec/spec_helper.rb +16 -0
- metadata +262 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 71916455cffe07c8464fb8cc1543d7b8a2ea7205
|
4
|
+
data.tar.gz: bc30072b7d62770c3e202e0545137056fe5a6164
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 997d3fc4fb5d9c18546e1ea4c5c8acd19e61ef6979ece0d27cff540cea99c2ecae094fba16a4c3aa25dc05f1fe9282498c228a898b68b4271e493027663e0ba3
|
7
|
+
data.tar.gz: 42d5ea917f3febe6484a80ab085f0b41515540f841edc2de4b219d06456d7d331a750fb306095336918b4c82f4cd184d1dc6099cd4ff0fd51e2cb487adab9944
|
data/.gitignore
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
|
19
|
+
.rvmrc
|
20
|
+
|
21
|
+
ToDo.task
|
22
|
+
|
23
|
+
teste.rb
|
24
|
+
|
25
|
+
.DS_Store
|
26
|
+
test-data/
|
27
|
+
jars/*
|
28
|
+
models/*
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# A sample Guardfile
# More info at https://github.com/guard/guard#readme

# re-run the spec that corresponds to a changed lib file
guard :rspec do
  watch(%r{^spec/.+_spec\.rb$})
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
  watch('spec/spec_helper.rb') { "spec" }
end

# keep the bundle in sync when the Gemfile or gemspec changes
guard :bundler do
  watch('Gemfile')
  watch(/^.+\.gemspec/)
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 LeFnord
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# NlpToolz
|
2
|
+
|
3
|
+
Basic NLP tools, mostly based on [OpenNLP](http://opennlp.apache.org), at this time `sentence finder`, `tokenizer` and `POS tagger` implemented, plus [Berkeley Parser](http://code.google.com/p/berkeleyparser/).
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'nlp_toolz'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install nlp_toolz
|
18
|
+
|
19
|
+
Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder.
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
see: [nlp_toolz.rb](https://github.com/LeFnord/nlp_toolz/blob/master/lib/nlp_toolz.rb) and specs for usage
|
24
|
+
|
25
|
+
## Contributing
|
26
|
+
|
27
|
+
1. Fork it
|
28
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
31
|
+
5. Create new Pull Request
|
32
|
+
|
33
|
+
## Comments
|
34
|
+
|
35
|
+
- removed Celluloid, do concurrency in your app, where it be used
|
36
|
+
- check `load_jars` for JVM parameters
|
37
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require "awesome_print"
|
4
|
+
require 'rspec/core'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
7
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
8
|
+
end
|
9
|
+
|
10
|
+
task :default => :spec
|
11
|
+
|
12
|
+
require 'yard'
|
13
|
+
YARD::Rake::YardocTask.new
|
14
|
+
|
15
|
+
Dir["lib/tasks/**/*.rake"].sort.each { |ext| load ext }
|
data/bin/nlp_toolz
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'gli'
|
4
|
+
begin # XXX: Remove this begin/rescue before distributing your app
|
5
|
+
require 'nlp_toolz'
|
6
|
+
rescue LoadError
|
7
|
+
STDERR.puts "In development, you need to use `bundle exec bin/nlp_toolz` to run your app"
|
8
|
+
STDERR.puts "At install-time, RubyGems will make sure lib, etc. are in the load path"
|
9
|
+
STDERR.puts "Feel free to remove this message from bin/NlpToolz now"
|
10
|
+
exit 64
|
11
|
+
end
|
12
|
+
|
13
|
+
include GLI::App
|
14
|
+
|
15
|
+
# helper methods

# Print +this+ to stdout: pretty-printed (awesome_print) when attached
# to a TTY, plain text otherwise so output stays pipe-friendly.
def get_out(this)
  if $stdout.tty?
    ap this
  else
    $stdout.puts this
  end
end

# Resolve a CLI argument: when it names an existing regular file, return
# the file's content; otherwise treat the argument itself as input text.
def get_in(input_arg)
  if File.exist?(input_arg) && !File.directory?(input_arg)
    get_file(input_arg)
  else
    input_arg
  end
end

# Read a whole file and return it tagged as UTF-8; returns "" for an
# empty file.  The block form of File.open guarantees the handle is
# closed (the original leaked it), and the empty-file case previously
# returned nil because `"" if file.nil?` discarded its value.
def get_file(name)
  content = File.open(name) { |f| f.gets(nil) }
  content.nil? ? "" : content.force_encoding("utf-8")
end
|
34
|
+
|
35
|
+
|
36
|
+
program_desc 'running basic NLP tasks'

version NlpToolz::VERSION

# All four subcommands share one shape: an optional -f/--file flag,
# input taken from the flag or the first positional argument, one
# NlpToolz entry point, result printed via get_out.
commands = {
  sent:  ['sentence detection',  :get_sentences],
  parse: ['parsing text',        :parse_text],
  tag:   ['pos tagging of text', :tag_text],
  token: ['tokenizing text',     :tokenize_text]
}

commands.each do |name, (description, entry_point)|
  desc description
  arg_name "Describe arguments to #{name} here"
  command name do |c|
    c.desc 'file input'
    c.arg_name '<path/to/file>'
    c.flag [:f, :file]
    c.action do |_global_options, options, args|
      input = get_in(options[:f] || args.first)
      get_out NlpToolz.public_send(entry_point, input)
    end
  end
end

# returning true lets GLI handle/report the error itself
on_error do |_exception|
  true
end

exit run(ARGV)
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Lang

  include UrlHandler

  # Detect the language of the input by POSTing it to the ASV
  # language-identification web service and reading the "lang" field of
  # the JSON response.  Uses @input (set by the including class) when
  # present, otherwise the +text+ argument.
  def get_language(text = nil)
    environment = ENV['ENV_NAME'] || 'development'
    # ToDo 2013-03-14: respect environment
    case environment
    when 'development'
      # development -> local
      # uri = build_url("localhost", 9292, "/langid", nil)
      uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
    when 'production'
      # production
      uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
    end

    # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is
    # the compatible replacement (same RFC 2396 escaping).
    payload = @input || text
    if payload
      asv_response = post_data(URI::DEFAULT_PARSER.escape(payload), uri,
                               {'Content-type' => 'text/plain;charset=utf-8'})
    end
    response = MultiJson.load(asv_response.body)

    response["lang"]
  end

  # ToDo 2013-02-26: make different lang identifier available
  # NOTE(review): the result of this method appears unused elsewhere in
  # the gem; it returns [[:en, :eng, :english], [:de, :ger, :german]].
  def alternative_langs lang
    langs = {
      en: [:eng, :english],
      de: [:ger, :german]
    }.each.collect{|x| x.flatten}
  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# coding: utf-8

class String
  # ToDo: check abbr against list of ..
  # Normalize text for the NLP tools: strip quotation marks and pad
  # braces/punctuation with spaces so they tokenize as separate words.
  # Invalid/undefined bytes are replaced during the UTF-8 re-encode.
  def clean_up
    encode('UTF-8', :invalid => :replace, :undef => :replace)
      .gsub(/[\p{Pi}\p{Pf}"'„“‘’“”«»‹›]/,'') # quotation marks
      .gsub(/\b\/\b/,' ')
      .gsub(/(\p{Ps})(.)/,'\1 \2')           # left braces
      .gsub(/(.)(\p{Pe})/,'\1 \2')           # right braces
      .gsub(/([\w]{3,})([\.])/,'\1 \2')      # abbreviation?
      .gsub(/(.)([,;:!?]+)/,'\1 \2')         # punctuation
  end

  # Last path component, e.g. "a/b/c.rb".basename => "c.rb".
  def basename
    split("/").last
  end
end
|
20
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'tempfile'

# Small Tempfile helpers used to feed text through external commands
# (e.g. the Berkeley Parser jar).
module TmpFile
  module_function

  # Create a Tempfile, optionally pre-filled with +text+, rewound so the
  # caller can read it from the start.
  def make_tmp_file_from text = nil
    file = ::Tempfile.new('tmp.txt')
    file.write(text) unless text.nil?
    file.rewind
    file
  end

  # Close the handle and remove the file from disk.
  def delete_and_unlink_tmp_file tmp_file
    tmp_file.close
    tmp_file.unlink
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'uri'
require 'net/http'

# Mixin with small HTTP helpers, used by Lang to talk to the remote
# language-identification service.
module UrlHandler
  module ClassMethods
  end

  # instance methods

  # Build an http:// URI from its parts; +port+ and +query+ may be nil.
  def build_url(host, port, path, query)
    parts = { :host => host, :path => path, :query => query }
    parts[:port] = port unless port.nil?
    URI::HTTP.build(parts)
  end


  # POST +content+ to +uri+ and return the raw Net::HTTP response.
  # NOTE(review): force_encoding mutates the caller's string in place.
  def post_data(content, uri, content_type)
    request = Net::HTTP::Post.new(uri.request_uri, content_type)
    request.body = content.force_encoding("utf-8")
    Net::HTTP.start(uri.host, uri.port) { |http| http.request(request) }
  end

  def self.included(receiver)
    receiver.extend ClassMethods
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Resolve the jar/model directories relative to this file and boot the
# JVM (via Rjb) with the OpenNLP class path.  Must run before any
# Rjb::import call in the tool classes.
module NlpToolz
  MODELS = File.join(File.dirname(__FILE__), '..', '..', "models")
  JARS = File.join(File.dirname(__FILE__), '..', '..', "jars")

  CLASS_PATH = [
    "jwnl-1.3.3.jar",
    "opennlp-tools-1.5.3.jar",
    "opennlp-maxent-3.0.3.jar"
  ].map { |jar| File.join(JARS, jar) }.join(":")

  Rjb::load(CLASS_PATH,['-Xmx4096m','-Djava.awt.headless=true'])
  # alternative GC settings (see benchmarks below):
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseParallelGC','-XX:+UseParallelOldGC','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseConcMarkSweepGC','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseSerialGC','-Djava.awt.headless=true'])
end

# simple example benchmarks, pos tagging 862 phrases:
# /wo extra options -> 656s
# /w ParallelGC -> 657s
# /w ConcMarkSweepGC -> 659s
# /w SerialGC -> 668s
# see: [Java GC tuning](http://www.oracle.com/technetwork/java/javase/gc-tuning-6-140523.html)
|
@@ -0,0 +1,146 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-12-10

module NlpToolz

  # Constituency parsing via the Berkeley Parser: shells out to the jar
  # with tempfiles for input/output, then converts the bracketed parse
  # tree into a nested Hash.
  class Parser

    include Lang
    include TmpFile

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')

    attr_reader :parsed
    attr_accessor :input, :lang, :model, :model_name, :parse_hash

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-sm5.gr"
      get_model
    end

    # Run the Berkeley Parser over @input and build @parse_hash.
    # Does nothing (returns nil) when no grammar model is installed.
    def parse_text
      parsed = nil
      if self.has_model?
        jar = "#{JARS}/BerkeleyParser-1.7.jar"
        in_file = make_tmp_file_from @input.clean_up
        out_file = make_tmp_file_from
        `java -Xmx4g -jar #{jar} -gr #{@model} -inputFile #{in_file.path} -outputFile #{out_file.path} -tokenize -maxLength 500`.chomp
        # block form closes the handle (the original leaked the File object)
        @parsed = File.open(out_file) { |f| f.gets(nil) }.chomp

        parse_output_to_hash

        delete_and_unlink_tmp_file in_file
        delete_and_unlink_tmp_file out_file
      end
    end

    def has_model?
      @model
    end

    # NOTE(review): +level+ is currently ignored; always returns the
    # first layer (tags/tokens) collected by create_leafs.
    def layer(level = nil)
      @first_layer
    end

    # NOTE(review): shadows Object#hash, which Ruby uses for Hash keys —
    # Parser instances should not be used as hash keys.
    def hash
      @parse_hash
    end

    private

    # helper for ...
    # initialize
    # Resolve the grammar file for @lang; @model stays false when the
    # file is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/parser/#{@model_name}"
      if File.exist?(model_file)
        @model = model_file
      else
        @model = false
      end
    end

    # convert: #tree -> #hash
    def parse_output_to_hash
      parsed = split_parse_tree(self.parsed)
      nodes = create_leafs(parsed)
      @parse_hash = make_hash_hash(nodes)

      @parse_hash
    end

    # helper for parsing to hash
    ::Leaf = Struct.new(:tag, :token)
    ::Node = Struct.new(:tag, :parent, :childs)

    # 1. split: pad closing parens apart, swap () for {} (which cannot
    # occur as tokens), then split on whitespace.
    def split_parse_tree(parsed)
      bar = parsed.gsub("))", ") )").gsub("))", ") )")
                  .gsub("(", "{")
                  .gsub(")", "}")

      bar.split
    end

    # 2. merge tags and tokens, create leafs
    # Walks the token stream pairing "{TAG" with the following "token}"
    # into Leaf structs; non-terminal markers are kept as strings so
    # make_hash_hash can fold them later.
    def create_leafs(parsed)
      @first_layer = {tags: [],tokens: []}
      leafs = {}
      foo = []
      parsed.each_with_index do |part,i|
        if part =~ /\{([\w\-]+|\$\p{P}|\p{P})/ && parsed[i+1] =~ /([\p{L}\p{N}\-\.]+|\p{P})\}/
          tag = part.gsub("{","")
          token = parsed[i+1].gsub("}","")
          @first_layer[:tags] << tag
          @first_layer[:tokens] << token

          leaf = Leaf.new(tag.to_sym,token)

          # group consecutive leaves under the same parent into an Array
          if foo[foo.length-1].is_a?(Hash)
            foo[foo.length-1] = [foo[foo.length-1], leaf]
          elsif foo[foo.length-1].is_a?(Array)
            foo[foo.length-1] << leaf
          else
            foo << leaf
          end
        elsif part !~ /([\p{L}\p{N}\-]+|\p{P})\}/
          if part =~ /(\{)(.+)/
            foo << "{#{part.gsub("{","")}"
          else
            foo << "#{part}"
          end
        end
      end

      foo
    end

    # Repeatedly folds the innermost "{TAG ... }" span into {TAG => [...]}
    # until only the root remains; recursion terminates when at most 3
    # elements are left and tmp[1] is the completed hash.
    def make_hash_hash(nodes)
      tmp = catch(:done) {
        nodes.reverse.each_with_index do |node,i|
          if node =~ /\{(\w+)/
            key = node.match(/\{(\w+)/)[1].to_sym
            part = []
            nodes[-i-1..-1].each_with_index do |x,ii|
              if x == "}"
                part = {key => nodes[-i..-i+ii-2]}
                throw :done, [nodes[0..-i-2],part,nodes[-i+ii..-1]].flatten
              end
            end
          end
        end
      }
      if tmp.length > 3
        make_hash_hash(tmp)
      else
        tmp[1]
      end
    end

  end # class Parser

end # module NlpToolz
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-10-24

# ToDo 2012-10-24: add train capabilities
module NlpToolz

  # Part-of-speech tagging via the OpenNLP maxent tagger (through Rjb).
  class PosTags

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    POSModel = Rjb::import('opennlp.tools.postag.POSModel')
    POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')

    attr_accessor :input, :lang, :model, :model_name, :tokenized

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-pos-maxent.bin"
      get_model
    end

    # Tag @input and split the "token/TAG" stream into @tokenized
    # ({tokens: [...], tags: [...]}).  No-op without a model.
    def get_pos_tags
      if self.has_model?
        @tokenized = tokenize_it @tagger.tag(@input.clean_up)
      end
    end

    def tokens
      @tokenized[:tokens]
    end

    def tags
      @tokenized[:tags]
    end

    def has_model?
      @model
    end

    private

    # Load the tagger model for @lang; @model stays false when the file
    # is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/pos/#{@model_name}"
      if File.exist?(model_file)
        @model = POSModel.new(FileInputStream.new(model_file))
        @tagger = POSTaggerME.new(@model)
      else
        @model = false
      end
    end

    # ToDo 2012-11-28: only a workaround upto the opennlp tokenizer is implemented
    # Splits "token/TAG" pairs; tokens containing extra "/" get the last
    # segment as tag for each preceding segment.  NOTE(review): a token
    # with no "/" at all is silently dropped.
    def tokenize_it stream
      foo = {tokens: [], tags: []}
      stream.split.each do |token|
        splitter = token.split("/")
        if splitter.length == 2
          foo[:tokens] << splitter.first
          foo[:tags] << splitter.last
        else
          splitter[0..-2].each do |splits|
            foo[:tokens] << splits
            foo[:tags] << splitter.last
          end
        end
      end
      foo
    end

  end # class PosTags

end # module NlpToolz
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-10-23

# ToDo 2012-10-24: add train capabilities
module NlpToolz

  # Sentence boundary detection via the OpenNLP maxent sentence
  # detector (through Rjb).
  class Sentences

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
    SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')

    attr_accessor :input, :lang, :model, :model_name, :sentences

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input,lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-sent.bin"
      get_model
    end

    # Split @input into an Array of sentence strings.
    def split_into_sentences
      @sentences = @sentence_detector.sentDetect(@input).to_a
    end

    def has_model?
      @model
    end

    private

    # Load the sentence model for @lang; @model stays false when the
    # file is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/sent/#{@model_name}"
      if File.exist?(model_file)
        @model = SentenceModel.new(FileInputStream.new(model_file))
        @sentence_detector = SentenceDetectorME.new(@model)
      else
        @model = false
        # raise 'file not found'
      end
    end

  end # class Sentences

end # module NlpToolz
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-11-30

module NlpToolz

  # Tokenization via the OpenNLP maxent tokenizer (through Rjb).
  class Tokens

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
    TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')

    attr_accessor :input, :lang, :model, :model_name, :tokens

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-token.bin"
      get_model
    end

    # Tokenize @input with the loaded model.
    def tokenize
      @tokens = @tokenizer.tokenize(@input)
    end

    def has_model?
      @model
    end

    private

    # Load the tokenizer model for @lang; @model stays false when the
    # file is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/token/#{@model_name}"
      if File.exist?(model_file)
        @model = TokenizerModel.new(FileInputStream.new(model_file))
        @tokenizer = TokenizerME.new(@model)
      else
        @model = false
      end
    end

  end # Class Tokens

end # module NlpToolz
|