nlp_toolz 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - data/.gitignore +28 -0
 - data/.rspec +2 -0
 - data/Gemfile +6 -0
 - data/Guardfile +13 -0
 - data/LICENSE.txt +22 -0
 - data/README.md +37 -0
 - data/Rakefile +15 -0
 - data/bin/nlp_toolz +92 -0
 - data/lib/nlp_toolz/helpers/lang.rb +36 -0
 - data/lib/nlp_toolz/helpers/string_extended.rb +20 -0
 - data/lib/nlp_toolz/helpers/tmp_file.rb +18 -0
 - data/lib/nlp_toolz/helpers/url_handler.rb +26 -0
 - data/lib/nlp_toolz/load_jars.rb +22 -0
 - data/lib/nlp_toolz/parser.rb +146 -0
 - data/lib/nlp_toolz/pos_tags.rb +77 -0
 - data/lib/nlp_toolz/sentences.rb +50 -0
 - data/lib/nlp_toolz/tokens.rb +48 -0
 - data/lib/nlp_toolz/version.rb +8 -0
 - data/lib/nlp_toolz.rb +84 -0
 - data/nlp_toolz.gemspec +42 -0
 - data/spec/helpers/string_extended_spec.rb +17 -0
 - data/spec/lib/nlp_toolz/parser_spec.rb +67 -0
 - data/spec/lib/nlp_toolz/pos_tags_spec.rb +67 -0
 - data/spec/lib/nlp_toolz/sentences_spec.rb +60 -0
 - data/spec/lib/nlp_toolz/tokens_spec.rb +62 -0
 - data/spec/lib/nlp_toolz_spec.rb +69 -0
 - data/spec/spec_helper.rb +16 -0
 - metadata +262 -0
 
    
        checksums.yaml
    ADDED
    
    | 
         @@ -0,0 +1,7 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ---
         
     | 
| 
      
 2 
     | 
    
         
            +
            SHA1:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 71916455cffe07c8464fb8cc1543d7b8a2ea7205
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: bc30072b7d62770c3e202e0545137056fe5a6164
         
     | 
| 
      
 5 
     | 
    
         
            +
            SHA512:
         
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 997d3fc4fb5d9c18546e1ea4c5c8acd19e61ef6979ece0d27cff540cea99c2ecae094fba16a4c3aa25dc05f1fe9282498c228a898b68b4271e493027663e0ba3
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 42d5ea917f3febe6484a80ab085f0b41515540f841edc2de4b219d06456d7d331a750fb306095336918b4c82f4cd184d1dc6099cd4ff0fd51e2cb487adab9944
         
     | 
    
        data/.gitignore
    ADDED
    
    | 
         @@ -0,0 +1,28 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            *.gem
         
     | 
| 
      
 2 
     | 
    
         
            +
            *.rbc
         
     | 
| 
      
 3 
     | 
    
         
            +
            .bundle
         
     | 
| 
      
 4 
     | 
    
         
            +
            .config
         
     | 
| 
      
 5 
     | 
    
         
            +
            .yardoc
         
     | 
| 
      
 6 
     | 
    
         
            +
            Gemfile.lock
         
     | 
| 
      
 7 
     | 
    
         
            +
            InstalledFiles
         
     | 
| 
      
 8 
     | 
    
         
            +
            _yardoc
         
     | 
| 
      
 9 
     | 
    
         
            +
            coverage
         
     | 
| 
      
 10 
     | 
    
         
            +
            doc/
         
     | 
| 
      
 11 
     | 
    
         
            +
            lib/bundler/man
         
     | 
| 
      
 12 
     | 
    
         
            +
            pkg
         
     | 
| 
      
 13 
     | 
    
         
            +
            rdoc
         
     | 
| 
      
 14 
     | 
    
         
            +
            spec/reports
         
     | 
| 
      
 15 
     | 
    
         
            +
            test/tmp
         
     | 
| 
      
 16 
     | 
    
         
            +
            test/version_tmp
         
     | 
| 
      
 17 
     | 
    
         
            +
            tmp
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            .rvmrc
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            ToDo.task
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            teste.rb
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            .DS_Store
         
     | 
| 
      
 26 
     | 
    
         
            +
            test-data/
         
     | 
| 
      
 27 
     | 
    
         
            +
            jars/*
         
     | 
| 
      
 28 
     | 
    
         
            +
            models/*
         
     | 
    
        data/.rspec
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/Guardfile
    ADDED
    
    | 
         @@ -0,0 +1,13 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # A sample Guardfile
         
     | 
| 
      
 2 
     | 
    
         
            +
            # More info at https://github.com/guard/guard#readme
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            guard :bundler do
         
     | 
| 
      
 5 
     | 
    
         
            +
              watch('Gemfile')
         
     | 
| 
      
 6 
     | 
    
         
            +
              watch(/^.+\.gemspec/)
         
     | 
| 
      
 7 
     | 
    
         
            +
            end
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            guard :rspec do
         
     | 
| 
      
 10 
     | 
    
         
            +
              watch(%r{^spec/.+_spec\.rb$})
         
     | 
| 
      
 11 
     | 
    
         
            +
              watch(%r{^lib/(.+)\.rb$})     { |m| "spec/lib/#{m[1]}_spec.rb" }
         
     | 
| 
      
 12 
     | 
    
         
            +
              watch('spec/spec_helper.rb')  { "spec" }
         
     | 
| 
      
 13 
     | 
    
         
            +
            end
         
     | 
    
        data/LICENSE.txt
    ADDED
    
    | 
         @@ -0,0 +1,22 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            Copyright (c) 2012 LeFnord
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            MIT License
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person obtaining
         
     | 
| 
      
 6 
     | 
    
         
            +
            a copy of this software and associated documentation files (the
         
     | 
| 
      
 7 
     | 
    
         
            +
            "Software"), to deal in the Software without restriction, including
         
     | 
| 
      
 8 
     | 
    
         
            +
            without limitation the rights to use, copy, modify, merge, publish,
         
     | 
| 
      
 9 
     | 
    
         
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         
     | 
| 
      
 10 
     | 
    
         
            +
            permit persons to whom the Software is furnished to do so, subject to
         
     | 
| 
      
 11 
     | 
    
         
            +
            the following conditions:
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be
         
     | 
| 
      
 14 
     | 
    
         
            +
            included in all copies or substantial portions of the Software.
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         
     | 
| 
      
 17 
     | 
    
         
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         
     | 
| 
      
 18 
     | 
    
         
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         
     | 
| 
      
 19 
     | 
    
         
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         
     | 
| 
      
 20 
     | 
    
         
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         
     | 
| 
      
 21 
     | 
    
         
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         
     | 
| 
      
 22 
     | 
    
         
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         
     | 
    
        data/README.md
    ADDED
    
    | 
         @@ -0,0 +1,37 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # NlpToolz
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Basic NLP tools, mostly based on [OpenNLP](http://opennlp.apache.org). At this time a `sentence finder`, a `tokenizer` and a `POS tagger` are implemented, plus the [Berkeley Parser](http://code.google.com/p/berkeleyparser/).
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            ## Installation
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            Add this line to your application's Gemfile:
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
                gem 'nlp_toolz'
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            And then execute:
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                $ bundle
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            Or install it yourself as:
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
                $ gem install nlp_toolz
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            Download the jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip them into the gem folder.
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            ## Usage
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            see: [nlp_toolz.rb](https://github.com/LeFnord/nlp_toolz/blob/master/lib/nlp_toolz.rb) and specs for usage
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            ## Contributing
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
            1. Fork it
         
     | 
| 
      
 28 
     | 
    
         
            +
            2. Create your feature branch (`git checkout -b my-new-feature`)
         
     | 
| 
      
 29 
     | 
    
         
            +
            3. Commit your changes (`git commit -am 'Add some feature'`)
         
     | 
| 
      
 30 
     | 
    
         
            +
            4. Push to the branch (`git push origin my-new-feature`)
         
     | 
| 
      
 31 
     | 
    
         
            +
            5. Create new Pull Request
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
            ## Comments
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            - removed Celluloid; handle concurrency in your own app, where it is needed
         
     | 
| 
      
 36 
     | 
    
         
            +
            - check `load_jars` for JVM parameters
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
    
        data/Rakefile
    ADDED
    
    | 
         @@ -0,0 +1,15 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #!/usr/bin/env rake
         
     | 
| 
      
 2 
     | 
    
         
            +
            require "bundler/gem_tasks"
         
     | 
| 
      
 3 
     | 
    
         
            +
            require "awesome_print"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'rspec/core'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'rspec/core/rake_task'
         
     | 
| 
      
 6 
     | 
    
         
            +
            RSpec::Core::RakeTask.new(:spec) do |spec|
         
     | 
| 
      
 7 
     | 
    
         
            +
              spec.pattern = FileList['spec/**/*_spec.rb']
         
     | 
| 
      
 8 
     | 
    
         
            +
            end
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            task :default => :spec
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            require 'yard'
         
     | 
| 
      
 13 
     | 
    
         
            +
            YARD::Rake::YardocTask.new
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            Dir["lib/tasks/**/*.rake"].sort.each { |ext| load ext }
         
     | 
    
        data/bin/nlp_toolz
    ADDED
    
    | 
         @@ -0,0 +1,92 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #!/usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require 'gli'
         
     | 
| 
      
 4 
     | 
    
         
            +
            begin # XXX: Remove this begin/rescue before distributing your app
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'nlp_toolz'
         
     | 
| 
      
 6 
     | 
    
         
            +
            rescue LoadError
         
     | 
| 
      
 7 
     | 
    
         
            +
              STDERR.puts "In development, you need to use `bundle exec bin/nlp_toolz` to run your app"
         
     | 
| 
      
 8 
     | 
    
         
            +
              STDERR.puts "At install-time, RubyGems will make sure lib, etc. are in the load path"
         
     | 
| 
      
 9 
     | 
    
         
            +
              STDERR.puts "Feel free to remove this message from bin/NlpToolz now"
         
     | 
| 
      
 10 
     | 
    
         
            +
              exit 64
         
     | 
| 
      
 11 
     | 
    
         
            +
            end
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            include GLI::App
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            # helper methods
         
     | 
| 
      
 16 
     | 
    
         
            +
            # Emits a result on standard output.
            #
            # When stdout is a terminal the value is pretty-printed with `ap`;
            # otherwise (pipe, redirect) it is written as plain text with `puts`.
            # Always returns nil.
            def get_out(this)
              if $stdout.tty?
                ap this
              else
                $stdout.puts this
              end
              nil
            end
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            # Resolves a command-line input argument to the text to process.
            #
            # If +input_arg+ names an existing filesystem entry that is not a
            # directory, the content of that file is returned (via get_file).
            # Otherwise the argument itself is treated as the text and returned
            # unchanged.
            def get_in(input_arg)
              # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
              return get_file(input_arg) if File.exist?(input_arg) && !File.directory?(input_arg)

              input_arg
            end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
            # Reads the whole file +name+ and returns its content tagged as UTF-8.
            #
            # An empty file yields an empty string. The file handle is closed
            # before returning (File.read opens and closes the file itself).
            def get_file(name)
              File.read(name).force_encoding("utf-8")
            end
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
            program_desc 'running basic NLP tasks'
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            version NlpToolz::VERSION
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
            # `sent` command: runs sentence detection on the given input and prints the result.
            desc 'sentence detection'
            arg_name 'text'
            command :sent do |c|
              c.desc 'file input'
              c.arg_name '<path/to/file>'
              c.flag [:f,:file]
              c.action do |_global_options, options, args|
                # The --file flag wins; otherwise the first positional argument is used.
                input = get_in(options[:f] || args.first)
                get_out NlpToolz.get_sentences(input)
              end
            end
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
            # `parse` command: parses the given input and prints the result.
            desc 'parsing text'
            arg_name 'text'
            command :parse do |c|
              c.desc 'file input'
              c.arg_name '<path/to/file>'
              c.flag [:f,:file]
              c.action do |_global_options, options, args|
                # The --file flag wins; otherwise the first positional argument is used.
                input = get_in(options[:f] || args.first)
                get_out NlpToolz.parse_text(input)
              end
            end
         
     | 
| 
      
 63 
     | 
    
         
            +
             
     | 
| 
      
 64 
     | 
    
         
            +
            # `tag` command: POS-tags the given input and prints the result.
            desc 'pos tagging of text'
            arg_name 'text'
            command :tag do |c|
              c.desc 'file input'
              c.arg_name '<path/to/file>'
              c.flag [:f,:file]
              c.action do |_global_options, options, args|
                # The --file flag wins; otherwise the first positional argument is used.
                input = get_in(options[:f] || args.first)
                get_out NlpToolz.tag_text(input)
              end
            end
         
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
      
 76 
     | 
    
         
            +
            # `token` command: tokenizes the given input and prints the result.
            desc 'tokenizing text'
            arg_name 'text'
            command :token do |c|
              c.desc 'file input'
              c.arg_name '<path/to/file>'
              c.flag [:f,:file]
              c.action do |_global_options, options, args|
                # The --file flag wins; otherwise the first positional argument is used.
                input = get_in(options[:f] || args.first)
                get_out NlpToolz.tokenize_text(input)
              end
            end
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
| 
      
 88 
     | 
    
         
            +
            on_error do |exception|
         
     | 
| 
      
 89 
     | 
    
         
            +
              true
         
     | 
| 
      
 90 
     | 
    
         
            +
            end
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
            exit run(ARGV)
         
     | 
| 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # Language identification helpers.
            #
            # Uses +build_url+ and +post_data+ from UrlHandler and MultiJson for
            # decoding the service response.
            module Lang

              include UrlHandler

              # Gets the language of the input from the remote language-id service.
              #
              # The includer's @input takes precedence; +text+ is used when @input
              # is not set. Returns the value stored under "lang" in the JSON
              # response body.
              #
              # Raises ArgumentError when neither @input nor +text+ is available
              # (previously this failed later with a NoMethodError on nil).
              def get_language(text = nil)
                content = @input || text
                raise ArgumentError, 'no text given for language detection' unless content

                environment = ENV['ENV_NAME'] || 'development'
                # ToDo 2013-03-14: respect environment
                uri = case environment
                      when 'development'
                        # development -> local
                        # uri = build_url("localhost", 9292, "/langid", nil)
                        build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
                      else
                        # production, and any other environment name, so uri is never nil
                        build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
                      end

                # URI.escape was removed in Ruby 3.0; the default parser still
                # provides the same percent-escaping.
                escaped = URI::DEFAULT_PARSER.escape(content)
                asv_response = post_data(escaped, uri, {'Content-type'=>'text/plain;charset=utf-8'})
                response = MultiJson.load(asv_response.body)

                response["lang"]
              end

              # ToDo 2013-02-26: make different lang identifier available
              #
              # Returns one flat array per known language: the short code followed
              # by its alternative identifiers, e.g. [:en, :eng, :english].
              # The +lang+ argument is currently not used.
              def alternative_langs(lang)
                {
                  en: [:eng, :english],
                  de: [:ger, :german]
                }.map { |code_and_aliases| code_and_aliases.flatten }
              end

            end
         
     | 
| 
         @@ -0,0 +1,20 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            class String
              # ToDo: check abbr against list of ..
              #
              # Returns a UTF-8 copy of the string prepared for tokenizing:
              # quotation marks are removed, a slash between two words becomes a
              # space, and brackets, sentence-ending periods and punctuation get
              # separated from the neighbouring character by a space.
              def clean_up
                text = encode('UTF-8', :invalid => :replace, :undef => :replace)
                text = text.gsub(/[\p{Pi}\p{Pf}"'„“‘’“”«»‹›]/,'')   # quotation marks
                text = text.gsub(/\b\/\b/,' ')                       # slash between words
                text = text.gsub(/(\p{Ps})(.)/,'\1 \2')              # left braces
                text = text.gsub(/(.)(\p{Pe})/,'\1 \2')              # right braces
                text = text.gsub(/([\w]{3,})([\.])/,'\1 \2')         # abbreviation?
                text = text.gsub(/(.)([,;:!?]+)/,'\1 \2')            # punctuation
                text
              end

              # Returns the part after the last "/" (nil for an empty string).
              def basename
                segments = split("/")
                segments.last
              end
            end
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,18 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'tempfile'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # Small helpers around Tempfile; callable as TmpFile.method_name.
            module TmpFile
              module_function

              # Creates a Tempfile, writes +text+ into it (unless +text+ is nil),
              # rewinds it to the start and returns the open Tempfile.
              def make_tmp_file_from(text = nil)
                ::Tempfile.new('tmp.txt').tap do |handle|
                  handle.write(text) unless text.nil?
                  handle.rewind
                end
              end

              # Closes +tmp_file+ and removes it from disk.
              def delete_and_unlink_tmp_file(tmp_file)
                tmp_file.close
                tmp_file.unlink
              end

            end
         
     | 
| 
         @@ -0,0 +1,26 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'uri'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'net/http'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            # Mixin with helpers for building HTTP URLs and POSTing data.
            # Including it also extends the receiver with ClassMethods.
            module UrlHandler
              # Class-level extension point; currently empty.
              module ClassMethods
              end

              # instance methods

              # Builds an HTTP URI from its components.
              #
              # @param host [String]
              # @param port [Integer, nil] omitted from the URI when nil
              # @param path [String]
              # @param query [String, nil]
              # @return [URI::HTTP]
              def build_url(host, port, path, query)
                parts = {:host => host, :path => path, :query => query}
                parts[:port] = port unless port.nil?
                URI::HTTP.build(parts)
              end


              # Sends +content+ as the body of a POST request to +uri+.
              #
              # @param content [String] request body; sent tagged as UTF-8
              # @param uri [URI::HTTP] target; host, port and request_uri are used
              # @param content_type [Hash] passed to Net::HTTP::Post.new as the
              #   initial header hash
              # @return [Net::HTTPResponse] the response returned by the request
              def post_data(content,uri,content_type)
                post = Net::HTTP::Post.new(uri.request_uri,content_type)
                # Re-tag a copy: force_encoding on the argument itself would
                # mutate the caller's string and raise on frozen strings.
                post.body = content.dup.force_encoding("utf-8")
                Net::HTTP.start(uri.host,uri.port) {|http| http.request(post)}
              end

              # Hook run on `include UrlHandler`: adds ClassMethods to the includer.
              def self.included(receiver)
                receiver.extend ClassMethods
              end
            end
         
     | 
| 
         @@ -0,0 +1,22 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module NlpToolz
              # Directories located two levels above this file.
              MODELS = File.join(File.dirname(__FILE__), '..', '..', "models")
              JARS = File.join(File.dirname(__FILE__), '..', '..', "jars")

              # Java class path handed to the JVM. Entries are joined with the
              # platform's path separator (":" on Unix, ";" on Windows) instead
              # of a hard-coded ":".
              CLASS_PATH = [
                File.join(JARS, "jwnl-1.3.3.jar"),
                File.join(JARS, "opennlp-tools-1.5.3.jar"),
                File.join(JARS, "opennlp-maxent-3.0.3.jar")
              ].join(File::PATH_SEPARATOR)

              # Starts the JVM through Rjb as soon as this file is loaded.
              Rjb::load(CLASS_PATH,['-Xmx4096m','-Djava.awt.headless=true'])
              # Alternative GC settings that were benchmarked (see the numbers below this module):
              # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseParallelGC','-XX:+UseParallelOldGC','-Djava.awt.headless=true'])
              # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseConcMarkSweepGC','-Djava.awt.headless=true'])
              # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseSerialGC','-Djava.awt.headless=true'])
            end
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            # simple example benchmarks, pos tagging 862 phrases:
         
     | 
| 
      
 18 
     | 
    
         
            +
            # /wo extra options  -> 656s
         
     | 
| 
      
 19 
     | 
    
         
            +
            # /w ParallelGC      -> 657s
         
     | 
| 
      
 20 
     | 
    
         
            +
            # /w ConcMarkSweepGC -> 659s
         
     | 
| 
      
 21 
     | 
    
         
            +
            # /w SerialGC        -> 668s
         
     | 
| 
      
 22 
     | 
    
         
            +
            # see: [Java GC tuning](http://www.oracle.com/technetwork/java/javase/gc-tuning-6-140523.html)
         
     | 
| 
         @@ -0,0 +1,146 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding:  utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            # @author: LeFnord
         
     | 
| 
      
 3 
     | 
    
         
            +
            # @email:  pscholz.le@gmail.com
         
     | 
| 
      
 4 
     | 
    
         
            +
            # @date:   2012-12-10
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            module NlpToolz
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
              class Parser
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
                include Lang
         
     | 
| 
      
 11 
     | 
    
         
            +
                include TmpFile
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                # load java classes
         
     | 
| 
      
 14 
     | 
    
         
            +
                FileInputStream    = Rjb::import('java.io.FileInputStream')
         
     | 
| 
      
 15 
     | 
    
         
            +
                
         
     | 
| 
      
 16 
     | 
    
         
            +
                attr_reader :parsed
         
     | 
| 
      
 17 
     | 
    
         
            +
                attr_accessor :input, :lang, :model, :model_name, :parse_hash
         
     | 
| 
      
 18 
     | 
    
         
            +
                
         
     | 
| 
      
 19 
     | 
    
         
            +
                # Stores the text to parse, settles on a language and looks up
                # the matching grammar file.
                #
                # @param input [String] text to parse
                # @param lang [String, nil] language code; when nil/false the
                #   result of get_language is used (not defined in this class --
                #   presumably provided by the included Lang module; confirm)
                def initialize(input, lang = nil)
                  @input = input
                  @lang = lang ? lang : get_language
                  # grammar file name: "<lang>-sm5.gr"
                  @model_name = "#{@lang}-sm5.gr"
                  get_model
                end
         
     | 
| 
      
 25 
     | 
    
         
            +
                
         
     | 
| 
      
 26 
     | 
    
         
            +
                # Runs the Berkeley parser jar on the cleaned-up input in a
                # separate `java` process, stores the raw output in @parsed and
                # converts it with parse_output_to_hash.
                #
                # Does nothing when no grammar model is available.
                #
                # @return [Hash, nil] the value of parse_output_to_hash, or nil
                #   when no model is available
                def parse_text
                  return unless has_model?

                  jar = "#{JARS}/BerkeleyParser-1.7.jar"
                  in_file = make_tmp_file_from(@input.clean_up)
                  out_file = make_tmp_file_from

                  # Argument-array form: no shell is involved, so paths that
                  # contain spaces or shell metacharacters are passed verbatim.
                  command = ["java", "-Xmx4g", "-jar", jar,
                             "-gr", @model,
                             "-inputFile", in_file.path,
                             "-outputFile", out_file.path,
                             "-tokenize", "-maxLength", "500"]
                  # Read (and discard) the process output; the block form also
                  # waits for the process to finish.
                  IO.popen(command, &:read)

                  # File.read closes its handle and yields "" for an empty file.
                  @parsed = File.read(out_file.path).chomp

                  parse_output_to_hash
                ensure
                  # Remove the temp files even when the parser run fails.
                  delete_and_unlink_tmp_file in_file if in_file
                  delete_and_unlink_tmp_file out_file if out_file
                end
         
     | 
| 
      
 41 
     | 
    
         
            +
                
         
     | 
| 
      
 42 
     | 
    
         
            +
                # Tells whether a grammar file was found for the language.
                #
                # @return [String, false] the value stored in @model by
                #   get_model: the model file path, or false when it is missing
                def has_model?
                  @model
                end
         
     | 
| 
      
 45 
     | 
    
         
            +
                
         
     | 
| 
      
 46 
     | 
    
         
            +
                # Returns the flat tag/token layer collected by create_leafs.
                #
                # @param level [Object] accepted but currently ignored
                # @return [Hash, nil] {tags: [...], tokens: [...]}, or nil when
                #   nothing has been parsed yet
                def layer(level = nil)
                  @first_layer
                end
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
                # Returns the parse tree converted to nested hashes.
                #
                # NOTE(review): this shadows Object#hash, so instances return
                # the parse hash (or nil) where Ruby expects an Integer hash
                # code -- be careful when using instances as Hash keys.
                #
                # @return [Hash, nil] the value of @parse_hash
                def hash
                  @parse_hash
                end
         
     | 
| 
      
 53 
     | 
    
         
            +
                
         
     | 
| 
      
 54 
     | 
    
         
            +
                private
         
     | 
| 
      
 55 
     | 
    
         
            +
                
         
     | 
| 
      
 56 
     | 
    
         
            +
                # helper for ...
         
     | 
| 
      
 57 
     | 
    
         
            +
                # initialize
         
     | 
| 
      
 58 
     | 
    
         
            +
                # Looks for the grammar file "<MODELS>/parser/<@model_name>" and
                # records the outcome in @model.
                #
                # Uses File.exist?; the File.exists? alias was removed in Ruby 3.2.
                #
                # @return [String, false] the model path, or false if the file
                #   does not exist
                def get_model
                  model_file = "#{MODELS}/parser/#{@model_name}"
                  @model = File.exist?(model_file) ? model_file : false
                end
         
     | 
| 
      
 66 
     | 
    
         
            +
                
         
     | 
| 
      
 67 
     | 
    
         
            +
                # convert: #tree -> #hash
         
     | 
| 
      
 68 
     | 
    
         
            +
                # Converts the raw parser output into nested hashes in three
                # steps: split_parse_tree -> create_leafs -> make_hash_hash.
                #
                # @return [Hash, nil] the result, also stored in @parse_hash
                def parse_output_to_hash
                  @parse_hash = make_hash_hash(create_leafs(split_parse_tree(parsed)))
                end
         
     | 
| 
      
 75 
     | 
    
         
            +
                
         
     | 
| 
      
 76 
     | 
    
         
            +
                # helper for parsing to hash
         
     | 
| 
      
 77 
     | 
    
         
            +
                ::Leaf = Struct.new(:tag, :token)
         
     | 
| 
      
 78 
     | 
    
         
            +
                ::Node = Struct.new(:tag, :parent, :childs)
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
                # 1. split
         
     | 
| 
      
 81 
     | 
    
         
            +
                # Step 1: turns the bracketed parser output into a token list.
                # Runs of closing parentheses are separated by spaces so each
                # one becomes its own token, then "(" / ")" are rewritten to
                # "{" / "}" and the string is split on whitespace.
                #
                # @param parsed [String] raw bracketed tree
                # @return [Array<String>] e.g. "(NN cat))" -> ["{NN", "cat}", "}"]
                def split_parse_tree(parsed)
                  # put a space after every ")" that is directly followed by another ")"
                  spaced = parsed.gsub(/\)(?=\))/, ") ")
                  spaced.tr("()", "{}").split
                end
         
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
      
 89 
     | 
    
         
            +
                # 2. merge tags and tokens, create leafs
         
     | 
| 
      
 90 
     | 
    
         
            +
                # Step 2: merges every "{TAG" token with the "token}" that
                # follows it into a Leaf and passes the remaining structural
                # tokens through.
                #
                # Side effect: resets @first_layer and fills it with the tags
                # and tokens of all leafs, in order.
                #
                # The former `is_a?(Hash)` / `is_a?(Array)` branches were
                # removed: the list built here only ever contains Strings and
                # Leaf structs, so they could never run. The unused local
                # `leafs` is gone as well.
                #
                # @param parsed [Array<String>] output of split_parse_tree
                # @return [Array<String, Leaf>] structural tokens ("{NP", "}")
                #   interleaved with Leaf structs
                def create_leafs(parsed)
                  @first_layer = {tags: [], tokens: []}
                  foo = []
                  parsed.each_with_index do |part,i|
                    if part =~ /\{([\w\-]+|\$\p{P}|\p{P})/ && parsed[i+1] =~ /([\p{L}\p{N}\-\.]+|\p{P})\}/
                      # opening tag directly followed by its closing token => leaf
                      tag = part.gsub("{","")
                      token = parsed[i+1].gsub("}","")
                      @first_layer[:tags] << tag
                      @first_layer[:tokens] << token

                      foo << Leaf.new(tag.to_sym,token)
                    elsif part !~ /([\p{L}\p{N}\-]+|\p{P})\}/
                      # not a "token}" part: keep it as a structural marker
                      if part =~ /(\{)(.+)/
                        # normalise to exactly one leading "{"
                        foo << "{#{part.gsub("{","")}"
                      else
                        foo << "#{part}"
                      end
                    end
                    # parts of the form "token}" are skipped here; they were
                    # consumed together with the preceding tag, if it matched
                  end

                  foo
                end
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
                # Step 3: folds the flat node list into nested hashes.
                #
                # Each pass finds the LAST opening tag ("{TAG"), i.e. an
                # innermost bracket, pairs it with the first "}" after it and
                # replaces that span with {TAG: [children]}. The method then
                # recurses on the shortened list until at most three nodes
                # remain, which is expected to be ["{", {root}, "}"]; the middle
                # element is the result.
                #
                # Changes compared to the previous version:
                # * only String nodes are matched against the tag pattern --
                #   Leaf structs and Hashes do not respond to =~ since
                #   Object#=~ was removed in Ruby 3.2;
                # * slices use absolute indices; the old negative-offset slice
                #   turned into "the whole list" when the closing "}" was the
                #   last element.
                #
                # NOTE(review): a list with more than three nodes in which no
                # bracket can be reduced still recurses without making progress,
                # as before; the expected result for such input is unclear.
                #
                # @param nodes [Array<String, Leaf, Hash>] output of create_leafs
                # @return [Hash, nil] nested tree, nil for an empty list
                def make_hash_hash(nodes)
                  tmp = catch(:done) {
                    nodes.reverse.each_with_index do |node,i|
                      next unless node.is_a?(String) && node =~ /\{(\w+)/
                      key = node.match(/\{(\w+)/)[1].to_sym
                      open_at = nodes.length - 1 - i   # index of "{TAG" in nodes
                      nodes[open_at..-1].each_with_index do |x,ii|
                        next unless x == "}"
                        close_at = open_at + ii        # index of the matching "}"
                        part = {key => nodes[(open_at + 1)...close_at]}
                        throw :done, nodes[0...open_at] + [part] + nodes[(close_at + 1)..-1]
                      end
                    end
                  }
                  if tmp.length > 3
                    make_hash_hash(tmp)
                  else
                    tmp[1]
                  end
                end
         
     | 
| 
      
 143 
     | 
    
         
            +
             
     | 
| 
      
 144 
     | 
    
         
            +
              end # class Parser
         
     | 
| 
      
 145 
     | 
    
         
            +
             
     | 
| 
      
 146 
     | 
    
         
            +
            end # module NlpToolz
         
     | 
| 
         @@ -0,0 +1,77 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            # @author: LeFnord
         
     | 
| 
      
 3 
     | 
    
         
            +
            # @email:  pscholz.le@gmail.com
         
     | 
| 
      
 4 
     | 
    
         
            +
            # @date:   2012-10-24
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # ToDo 2012-10-24: add train capabilities
         
     | 
| 
      
 7 
     | 
    
         
            +
            module NlpToolz
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
              class PosTags
         
     | 
| 
      
 10 
     | 
    
         
            +
                
         
     | 
| 
      
 11 
     | 
    
         
            +
                include Lang
         
     | 
| 
      
 12 
     | 
    
         
            +
                
         
     | 
| 
      
 13 
     | 
    
         
            +
                # load java classes
         
     | 
| 
      
 14 
     | 
    
         
            +
                FileInputStream = Rjb::import('java.io.FileInputStream')
         
     | 
| 
      
 15 
     | 
    
         
            +
                POSModel        = Rjb::import('opennlp.tools.postag.POSModel')
         
     | 
| 
      
 16 
     | 
    
         
            +
                POSTaggerME     = Rjb::import('opennlp.tools.postag.POSTaggerME')
         
     | 
| 
      
 17 
     | 
    
         
            +
                
         
     | 
| 
      
 18 
     | 
    
         
            +
                attr_accessor :input, :lang, :model, :model_name, :tokenized
         
     | 
| 
      
 19 
     | 
    
         
            +
                
         
     | 
| 
      
 20 
     | 
    
         
            +
                # Stores the text to tag, settles on a language and loads the
                # matching POS model.
                #
                # @param input [String] text to tag
                # @param lang [String, nil] language code; when nil/false the
                #   result of get_language is used (not defined in this class --
                #   presumably provided by the included Lang module; confirm)
                def initialize(input, lang = nil)
                  @input = input
                  @lang = lang ? lang : get_language
                  # model file name: "<lang>-pos-maxent.bin"
                  @model_name = "#{@lang}-pos-maxent.bin"
                  get_model
                end
         
     | 
| 
      
 26 
     | 
    
         
            +
                
         
     | 
| 
      
 27 
     | 
    
         
            +
                # Tags the cleaned-up input and stores the tokens/tags split in
                # @tokenized. Does nothing when no model is available.
                #
                # @return [Hash, nil] {tokens: [...], tags: [...]}, or nil
                #   without a model
                def get_pos_tags
                  return unless has_model?

                  tagged = @tagger.tag(@input.clean_up)
                  @tokenized = tokenize_it(tagged)
                end
         
     | 
| 
      
 32 
     | 
    
         
            +
                
         
     | 
| 
      
 33 
     | 
    
         
            +
                # Tokens of the last tagging run.
                #
                # @return [Array<String>, nil] nil when get_pos_tags has not
                #   produced a result (e.g. no model); previously this raised
                #   NoMethodError on nil
                def tokens
                  @tokenized && @tokenized[:tokens]
                end
         
     | 
| 
      
 36 
     | 
    
         
            +
                
         
     | 
| 
      
 37 
     | 
    
         
            +
                # POS tags of the last tagging run, parallel to the tokens array.
                #
                # @return [Array<String>, nil] nil when get_pos_tags has not
                #   produced a result (e.g. no model); previously this raised
                #   NoMethodError on nil
                def tags
                  @tokenized && @tokenized[:tags]
                end
         
     | 
| 
      
 40 
     | 
    
         
            +
                
         
     | 
| 
      
 41 
     | 
    
         
            +
                # Tells whether a POS model could be loaded for the language.
                #
                # @return [Object, false] the value stored in @model by
                #   get_model: the loaded model, or false when the file is missing
                def has_model?
                  @model
                end
         
     | 
| 
      
 44 
     | 
    
         
            +
                
         
     | 
| 
      
 45 
     | 
    
         
            +
                private
         
     | 
| 
      
 46 
     | 
    
         
            +
                
         
     | 
| 
      
 47 
     | 
    
         
            +
                # Loads the POS model "<MODELS>/pos/<@model_name>" and builds
                # the tagger from it; sets @model to false when the file is
                # missing.
                #
                # Uses File.exist? (File.exists? was removed in Ruby 3.2) and
                # closes the Java input stream once the model has been read.
                #
                # @return [Object, false] the tagger, or false without a model file
                def get_model
                  model_file = "#{MODELS}/pos/#{@model_name}"
                  return @model = false unless File.exist?(model_file)

                  stream = FileInputStream.new(model_file)
                  begin
                    @model = POSModel.new(stream)
                  ensure
                    # the model is read in the constructor; release the file handle
                    stream.close
                  end
                  @tagger = POSTaggerME.new(@model)
                end
         
     | 
| 
      
 56 
     | 
    
         
            +
                
         
     | 
| 
      
 57 
     | 
    
         
            +
                # ToDo 2012-11-28: only a workaround until the OpenNLP tokenizer is implemented
         
     | 
| 
      
 58 
     | 
    
         
            +
                # Splits tagger output of the form "word/TAG word/TAG ..." into
                # parallel token and tag lists.
                #
                # For every whitespace-separated item the text after the last
                # "/" is the tag; each "/"-separated piece before it becomes a
                # token carrying that tag. Items without a "/" contribute
                # nothing.
                # NOTE(review): an item such as "and/or/CC" therefore yields two
                # tokens ("and", "or") for one tag -- confirm this is intended.
                #
                # @param stream [String] tagged text
                # @return [Hash] {tokens: [...], tags: [...]}
                def tokenize_it stream
                  result = {tokens: [], tags: []}
                  stream.split.each do |item|
                    *words, tag = item.split("/")
                    words.each do |word|
                      result[:tokens] << word
                      result[:tags] << tag
                    end
                  end
                  result
                end
         
     | 
| 
      
 74 
     | 
    
         
            +
                
         
     | 
| 
      
 75 
     | 
    
         
            +
              end # class PosTags
         
     | 
| 
      
 76 
     | 
    
         
            +
             
     | 
| 
      
 77 
     | 
    
         
            +
            end # module NlpToolz
         
     | 
| 
         @@ -0,0 +1,50 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            # @author: LeFnord
         
     | 
| 
      
 3 
     | 
    
         
            +
            # @email:  pscholz.le@gmail.com
         
     | 
| 
      
 4 
     | 
    
         
            +
            # @date:   2012-10-23
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # ToDo 2012-10-24: add train capabilities
         
     | 
| 
      
 7 
     | 
    
         
            +
            module NlpToolz
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
              # Splits a text into sentences with an OpenNLP sentence detector
              # (accessed through Rjb).
              #
              # The model file is looked up as "#{MODELS}/sent/<lang>-sent.bin".
              # When that file is missing, no detector is built and #has_model?
              # returns false.
              class Sentences

                include Lang

                # load java classes
                FileInputStream    = Rjb::import('java.io.FileInputStream')
                SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
                SentenceModel      = Rjb::import('opennlp.tools.sentdetect.SentenceModel')

                attr_accessor :input, :lang, :model, :model_name, :sentences

                # @param input the text handed to the sentence detector
                # @param lang  language prefix of the model file name; when nil,
                #   the value of get_language is used (presumably provided by
                #   the Lang mixin -- confirm)
                def initialize(input,lang = nil)
                  @input = input
                  @lang = lang || get_language
                  @model_name = "#{@lang}-sent.bin"
                  get_model
                end

                # Runs the detector on the input and stores the result in
                # @sentences.
                #
                # The detector only exists when a model was loaded, so check
                # #has_model? before calling; otherwise @sentence_detector is
                # nil and this raises NoMethodError.
                #
                # @return [Array] the detected sentences
                def split_into_sentences
                  @sentences = @sentence_detector.sentDetect(@input).to_a
                end

                # @return the loaded model object (truthy), or false when the
                #   model file was not found
                def has_model?
                  @model
                end

                private

                # Loads the sentence model for @model_name and builds the
                # detector; sets @model to false when the model file is absent.
                def get_model
                  model_file = "#{MODELS}/sent/#{@model_name}"
                  # File.exists? was deprecated and removed in Ruby 3.2.
                  if File.exist?(model_file)
                    stream = FileInputStream.new(model_file)
                    begin
                      @model = SentenceModel.new(stream)
                      @sentence_detector = SentenceDetectorME.new(@model)
                    ensure
                      # The model constructor does not close the stream;
                      # release the file handle ourselves.
                      stream.close
                    end
                  else
                    # A missing model is reported through #has_model? instead
                    # of raising.
                    @model = false
                  end
                end

              end # class Sentences
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            end # module NlpToolz
         
     | 
| 
         @@ -0,0 +1,48 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding:  utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            # @author: LeFnord
         
     | 
| 
      
 3 
     | 
    
         
            +
            # @email:  pscholz.le@gmail.com
         
     | 
| 
      
 4 
     | 
    
         
            +
            # @date:   2012-11-30
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            module NlpToolz
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
              # Tokenizes a text with an OpenNLP tokenizer (accessed through Rjb).
              #
              # The model file is looked up as "#{MODELS}/token/<lang>-token.bin".
              # When that file is missing, no tokenizer is built and #has_model?
              # returns false.
              class Tokens

                include Lang

                # load java classes
                FileInputStream    = Rjb::import('java.io.FileInputStream')
                TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
                TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')

                attr_accessor :input, :lang, :model, :model_name, :tokens

                # @param input the text handed to the tokenizer
                # @param lang  language prefix of the model file name; when nil,
                #   the value of get_language is used (presumably provided by
                #   the Lang mixin -- confirm)
                def initialize(input, lang = nil)
                  @input = input
                  @lang = lang || get_language
                  @model_name = "#{@lang}-token.bin"
                  get_model
                end

                # Runs the tokenizer on the input and stores the result in
                # @tokens.
                #
                # The tokenizer only exists when a model was loaded, so check
                # #has_model? before calling; otherwise @tokenizer is nil and
                # this raises NoMethodError.
                #
                # @return the tokens produced by the tokenizer
                def tokenize
                  @tokens = @tokenizer.tokenize(@input)
                end

                # @return the loaded model object (truthy), or false when the
                #   model file was not found
                def has_model?
                  @model
                end

                private

                # Loads the tokenizer model for @model_name and builds the
                # tokenizer; sets @model to false when the model file is absent.
                def get_model
                  model_file = "#{MODELS}/token/#{@model_name}"
                  # File.exists? was deprecated and removed in Ruby 3.2.
                  if File.exist?(model_file)
                    stream = FileInputStream.new(model_file)
                    begin
                      @model = TokenizerModel.new(stream)
                      @tokenizer = TokenizerME.new(@model)
                    ensure
                      # The model constructor does not close the stream;
                      # release the file handle ourselves.
                      stream.close
                    end
                  else
                    # A missing model is reported through #has_model? instead
                    # of raising.
                    @model = false
                  end
                end

              end # class Tokens
         
     | 
| 
      
 47 
     | 
    
         
            +
              
         
     | 
| 
      
 48 
     | 
    
         
            +
            end # module NlpToolz
         
     |