string-eater 0.2.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE +1 -1
- data/README.md +31 -15
- data/examples/address.rb +22 -17
- data/examples/nginx.rb +21 -13
- data/ext/string-eater/c-tokenizer.c +20 -6
- data/ext/string-eater/extconf.rb +1 -0
- data/lib/c-tokenizer.rb +52 -37
- data/lib/string-eater.rb +4 -2
- data/lib/token.rb +5 -3
- data/lib/version.rb +8 -4
- data/spec/nginx_spec.rb +33 -21
- data/spec/spec_helper.rb +1 -0
- data/spec/string_eater_spec.rb +122 -130
- metadata +5 -9
- data/lib/ruby-tokenizer-each-char.rb +0 -145
- data/lib/ruby-tokenizer.rb +0 -98
    
        checksums.yaml
    ADDED
    
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZmU1MmM0ZDY2MjQ4ZWM5MmFjN2E0YzVlODJkYWIwOWFlZDYxYzYyOQ==
+  data.tar.gz: !binary |-
+    NjhjZGQ1ZDQwZDVjNmE0MjZjM2Q5YTljNjAwNzFhNTJhNmE2ZmFmNw==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    YTg0NzZjZTFkYzhjNWVhMzE2YjNjMzQ4N2RlNWYzYTI1NWM5MTE1MjE4NGEw
+    ODdmNjRiYWZiODVmZGY0ZmI1MTk5MmZiZGMyYTBhNTRjODZjOGM2ODRiYjM5
+    MWMwMzJmNGVlOTAyYTI2YmY0NzM4MTEwNDM3NjI1MTE1ZmRmNDU=
+  data.tar.gz: !binary |-
+    NzQ2NTdlYjAzY2NiMWIzYTRkYTI1NGFhZjgxOWY0YjgxYzk4ZDkyMGU3MDAw
+    YmQ5YjQzNDAzNGViOGJmYjFmOGI1MDIyNGI2OWNiZGVhN2ZkNWJjYTYzNTBh
+    YWYzZWRiYjE4ODA3YjI1ZmM4NWExZmI2ZmJmMzljMDA1Nzc0ZGY=
    
        data/LICENSE
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
@@ -87,25 +87,41 @@ We can also do something like this:
 For another example, see `examples/nginx.rb`, which defines an
 [nginx](http://nginx.org) log line tokenizer.
 
-## 
+## Non-strict usage
+
+Use `set_non_strict` to indicate that separator finding should be
+non-strict.  This means that if the tokenizer fails to find a
+separator before finishing a string, it will fill in the last token
+with the remainder of the string.  Normally (i.e., strict usage), the
+token whose closing character was not found is left nil.
+
+Example:
+
+    class PersonTokenizer < StringEater::Tokenizer
+      add_field :last_name
+      look_for ", "
+      add_field :first_name, :extract => false
+      look_for " | "
+      add_field :street_address, :extract => false
+      look_for ", "
+      add_field :city
+      look_for ", "
+      add_field :state
+      look_for ", "
+      set_non_strict
+    end
 
-
-
+    tokenizer = PersonTokenizer.new
+    string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock"
+    tokenizer.tokenize! string
 
-
-
-   `StringEater::Tokenizer`.
+    puts tokenizer.last_name # => "Flinstone"
+    puts tokenizer.city      # => "Bedrock"  (if strict, would be nil)
 
-
-
-   implementation that is faster on Ruby than a translation of the C
-   algorithm.  Probably not as fast (or not much faster) than using
-   Ruby regular expressions. 
+Non-strict can also be set on an instance tokenizer,
+i.e., call `tokenizer.set_non_strict` to make `tokenizer` non-strict.
 
-
-   This is essentially the same as the C implementation, but written
-   in pure Ruby.  It uses `String#each_char` and is therefore VERY
-   SLOW!  It provides a good way to hack the algorithm, though.
+## Implementation
 
 The main algorithm works by finding the start and end points of tokens
 in a string.  The search is done incrementally (i.e., loop through the
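Non-strict mode is the headline behavior change in 1.0.0. A minimal sketch of the strict vs. non-strict contrast, assuming the 1.0.0 gem is installed (`WordTokenizer` here is illustrative, not part of the gem):

    require 'string-eater'

    # Illustrative three-field tokenizer (not from the gem)
    class WordTokenizer < StringEater::Tokenizer
      add_field :first
      look_for ' '
      add_field :second
      look_for '|'
      add_field :third
    end

    strict = WordTokenizer.new
    loose  = WordTokenizer.new
    loose.set_non_strict  # instance-level switch

    # 'a b' ends before the '|' separator is ever found
    strict.tokenize!('a b')
    puts strict.second.inspect  # => nil  (strict leaves the unfinished token nil)

    loose.tokenize!('a b')
    puts loose.second.inspect   # => "b"  (non-strict fills in the remainder)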
    
        data/examples/address.rb
    CHANGED
    
@@ -1,35 +1,40 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 # this is the example from the README
 require 'string-eater'
 
-class PersonTokenizer < StringEater::Tokenizer
-  add_field :last_name
-  look_for ", "
-  add_field :first_name, :extract => false
-  look_for " | "
-  add_field :street_address, :extract => false
-  look_for ", "
+# example tokenizer for addresses
+class PersonTokenizer < StringEater::Tokenizer
+  add_field :last_name
+  look_for ', '
+  add_field :first_name, extract: false
+  look_for ' | '
+  add_field :street_address, extract: false
+  look_for ', '
   add_field :city
-  look_for ", "
-  add_field :state
-  look_for ", "
+  look_for ', '
+  add_field :state
+  look_for ', '
 end
 
-if __FILE__ == $0
+if __FILE__ == $PROGRAM_NAME
   tokenizer = PersonTokenizer.new
   puts tokenizer.describe_line
 
-  string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000"
+  string = 'Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000'
   tokenizer.tokenize! string
 
-  puts tokenizer.last_name # => "Flinestone"
-  puts tokenizer.city      # => "Bedrock"
+  puts tokenizer.last_name # => "Flinestone"
+  puts tokenizer.city      # => "Bedrock"
   puts tokenizer.state     # => "NA"
 
-  tokenizer.tokenize!(string) do |tokens|
+  tokenizer.tokenize!(string) do |tokens|
     puts "The #{tokens[:last_name]}s live in #{tokens[:city]}"
   end
 end
    
        data/examples/nginx.rb
    CHANGED
    
@@ -1,27 +1,32 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 require 'string-eater'
 
+# Example tokenizer for nginx log lines
 class NginxLogTokenizer < StringEater::CTokenizer
   add_field :ip
-  look_for " - "
-  add_field :remote_user, :extract => false
-  look_for " ["
-  add_field :timestamp, :extract => false
+  look_for ' - '
+  add_field :remote_user, extract: false
+  look_for ' ['
+  add_field :timestamp, extract: false
   look_for "] \""
   add_field :request
   look_for "\" "
   add_field :status_code
-  look_for " "
-  add_field :bytes_sent, :extract => false
+  look_for ' '
+  add_field :bytes_sent, extract: false
   look_for " \""
   add_field :referrer_url
   look_for "\" \""
   add_field :user_agent
   look_for "\" \""
-  add_field :compression, :extract => false
+  add_field :compression, extract: false
   look_for "\" "
   add_field :remainder
 
@@ -47,14 +52,17 @@ class NginxLogTokenizer < StringEater::CTokenizer
   end
 end
 
-if __FILE__ == $0
+if __FILE__ == $PROGRAM_NAME
   tokenizer = NginxLogTokenizer.new
   puts tokenizer.describe_line
 
-  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] 
+  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+    '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+    '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+    'Trident/5.0)" "-" "there could be" other "stuff here"'
 
-  puts 
-  puts 
+  puts 'input string: ' + str
+  puts 'Tokens: '
 
   # use a block to work with the extracted tokens
   tokenizer.tokenize!(str) do |tokens|
    data/ext/string-eater/c-tokenizer.c
CHANGED

@@ -10,11 +10,12 @@ static VALUE rb_cCTokenizer;
 static VALUE rb_mStringEater;
 
 static VALUE tokenize_string(VALUE self, 
-                             VALUE string,
-                             VALUE tokens_to_find_indexes,
-                             VALUE tokens_to_find_strings,
-                             VALUE tokens_to_extract_indexes,
-                             VALUE tokens_to_extract_names)
+                             VALUE string,
+                             VALUE tokens_to_find_indexes,
+                             VALUE tokens_to_find_strings,
+                             VALUE tokens_to_extract_indexes,
+                             VALUE tokens_to_extract_names,
+                             VALUE non_strict)
 {
   const char* input_string = StringValueCStr(string);
   VALUE extracted_tokens = rb_hash_new();
@@ -115,6 +116,19 @@ static VALUE tokenize_string(VALUE self,
     }
   }
 
+  /*
+     got to the end of the string
+     and have an incomplete token
+     and not strict
+  */
+  if(ix == str_len && curr_token_ix < n_tokens && RTEST(non_strict))
+  {
+    rb_hash_aset(extracted_tokens,
+                 rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
+                 rb_usascii_str_new(input_string + startpoint,
+                                    str_len - startpoint));
+  }
+
   curr_token_ix = n_tokens - 1;
 
   if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
@@ -139,7 +153,7 @@ void Init_c_tokenizer_ext(void)
   rb_cCTokenizer = rb_define_class_under(rb_mStringEater, 
       "CTokenizer", rb_cObject);
 
-  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 5);
+  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 6);
 
   /* set the callback for when the extension is unloaded */
   rb_set_end_proc(finalize_c_tokenizer_ext, 0);
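The added block above is the C side of non-strict mode: when the scan reaches the end of the input while a token is still open, the remainder of the string is handed to that token. In rough Ruby terms (an illustrative sketch whose names mirror the C locals, not code from the gem):

    # ix         - scan position (== str_len when the input ran out)
    # startpoint - index where the still-open token began
    # names      - field names in declaration order (tokens_to_extract_names)
    if ix == str_len && curr_token_ix < n_tokens && non_strict
      extracted_tokens[names[curr_token_ix - 1]] =
        input_string[startpoint...str_len]
    end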
    
        data/ext/string-eater/extconf.rb
    CHANGED
    
    
    
        data/lib/c-tokenizer.rb
    CHANGED
    
@@ -1,17 +1,22 @@
+# encoding: utf-8
+
 require 'c_tokenizer_ext'
 
+# Ruby interface to the c extension
 class StringEater::CTokenizer
+  attr_reader :tokens
+
   def self.tokens
     @tokens ||= []
   end
 
-  def self.add_field name, opts = {}
-    tokens << StringEater::Token.new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
+  def self.add_field(name, opts = {})
+    tokens << StringEater::Token.new_field(name, opts)
+    define_method(name) { @extracted_tokens[name] }
   end
 
-  def self.look_for look_for_tokens
-    tokens << StringEater::Token.new_separator(look_for_tokens)
+  def self.look_for(look_for_tokens)
+    tokens << StringEater::Token.new_separator(look_for_tokens)
   end
 
   # This is very slow, only do it when necessary
@@ -19,12 +24,20 @@ class StringEater::CTokenizer
     Marshal.load(Marshal.dump(tokens))
   end
 
+  def self.set_non_strict
+    @class_non_strict = true
+  end
+
+  def self.non_strict?
+    @class_non_strict == true
+  end
+
   def initialize
     refresh_tokens
   end
 
-  def 
-    @
+  def set_non_strict
+    @non_strict = true
   end
 
   def extract_all_fields
@@ -41,7 +54,7 @@ class StringEater::CTokenizer
     refresh_tokens
   end
 
-  def extract_fields *fields
+  def extract_fields(*fields)
     @token_filter = lambda do |t|
       t.opts[:extract] = fields.include?(t.name)
     end
@@ -52,76 +65,78 @@ class StringEater::CTokenizer
   def refresh_tokens
     @tokens = self.class.dup_tokens
 
-    if @token_filter
-      @tokens.each{|t| @token_filter.call(t)}
-    end
-
-    tokens_to_find = tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
+    @tokens.each { |t| @token_filter.call(t) } if @token_filter
 
-    @tokens_to_find_indexes = tokens_to_find.map{|t| t[0]}
-    @tokens_to_find_strings = tokens_to_find.map{|t| t[1]}
+    tokens_to_find = gen_tokens_to_find
+    @tokens_to_find_indexes = tokens_to_find.map { |t| t[0] }
+    @tokens_to_find_strings = tokens_to_find.map { |t| t[1] }
 
-    tokens_to_extract = tokens.each_with_index.map do |t, i|
-      [i, t.name] if t.extract?
-    end.compact
-
-    @tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
-    @tokens_to_extract_names = tokens.map{|t| t.name}
+    tokens_to_extract = gen_tokens_to_extract
+    @tokens_to_extract_indexes = tokens_to_extract.map { |t| t[0] }
+    @tokens_to_extract_names = tokens.map { |t| t.name }
 
     @have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
   end
 
   def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
+    tokens.reduce('') do |desc, t|
+      desc << (t.string || t.name.to_s || 'xxxxxx')
     end
   end
 
   def do_extra_parsing
   end
 
-  def tokenize! string, &block
+  # Not sure this could be much more concise
+  # rubocop:disable MethodLength
+  def tokenize!(string, &block)
     @string = string
     @extracted_tokens ||= {}
     @extracted_tokens.clear
+    @non_strict ||= self.class.non_strict?
 
     return unless @have_tokens_to_extract
 
-    @extracted_tokens = ctokenize!(@string, 
+    @extracted_tokens = ctokenize!(@string,
                                    @tokens_to_find_indexes,
                                    @tokens_to_find_strings,
                                    @tokens_to_extract_indexes,
-                                   @tokens_to_extract_names)
+                                   @tokens_to_extract_names,
+                                   @non_strict)
 
     # extra parsing hook
     do_extra_parsing
 
-    if block_given?
-      yield @extracted_tokens
-    end
+    yield @extracted_tokens if block_given?
 
     # return self for chaining
     self
-  end
-  
+  end
+  # rubocop:enable MethodLength
+
   private
 
-  def set_token_startpoint ix, startpoint
+  def set_token_startpoint(ix, startpoint)
     @tokens[ix].breakpoints[0] = startpoint
   end
 
-  def get_token_startpoint ix
+  def get_token_startpoint(ix)
     @tokens[ix].breakpoints[0]
   end
 
-  def set_token_endpoint ix, endpoint
+  def set_token_endpoint(ix, endpoint)
     @tokens[ix].breakpoints[1] = endpoint
   end
 
-  def extract_token? ix
+  def extract_token?(ix)
     @tokens[ix].extract?
   end
 
+  def gen_tokens_to_find
+    tokens.each_with_index.map { |t, i| [i, t.string] if t.string }.compact
+  end
+
+  def gen_tokens_to_extract
+    tokens.each_with_index.map { |t, i| [i, t.name] if t.extract? }.compact
+  end
 end
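`set_non_strict` now exists at two levels: the class method flags every future instance, and `tokenize!` copies that flag into an instance on first use via `@non_strict ||= self.class.non_strict?`, while the instance method flips just one tokenizer. A short sketch (`Triple` is illustrative, not from the gem):

    class Triple < StringEater::CTokenizer
      add_field :a
      look_for ' '
      add_field :b
      look_for '|'
      add_field :c
      set_non_strict          # class-level: every Triple is non-strict
    end

    Triple.new.tokenize!('x y').b  # => "y" (missing '|' no longer leaves :b nil)

    # Or opt in a single instance of an otherwise strict class:
    # tokenizer.set_non_strict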
    
        data/lib/string-eater.rb
    CHANGED
    
@@ -1,10 +1,12 @@
+# encoding: utf-8
+
+# Namespacing module for StringEater
 module StringEater
   autoload :Token, 'token'
-  autoload :RubyTokenizer, 'ruby-tokenizer'
-  autoload :RubyTokenizerEachCHar, 'ruby-tokenizer-each-char'
   autoload :CTokenizer, 'c-tokenizer'
 
   autoload :VERSION, 'version'
 
+  # by default, Tokenizer is the c extension tokenizer
   class Tokenizer < CTokenizer; end
 end
    
        data/lib/token.rb
    CHANGED
    
@@ -1,9 +1,12 @@
+# encoding: utf-8
+
+# Token class used by tokenizers
 class StringEater::Token
   attr_accessor :name, :string, :opts, :breakpoints, :children
 
   def initialize
     @opts = {}
-    @breakpoints = [nil,nil]
+    @breakpoints = [nil, nil]
   end
 
   def extract?
@@ -13,7 +16,7 @@ class StringEater::Token
   def self.new_field(name, opts)
     t = new
     t.name = name
-    t.opts = {:extract => true}.merge(opts)
+    t.opts = { extract: true }.merge(opts)
     t
   end
 
@@ -22,5 +25,4 @@ class StringEater::Token
     t.string = string
     t
   end
-
 end
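The `{ extract: true }.merge(opts)` default makes extraction opt-out: a field is extracted unless the caller passes `extract: false`. A quick sketch, assuming `extract?` simply reflects `opts[:extract]`:

    t = StringEater::Token.new_field(:city, {})
    t.opts      # => { extract: true }

    t = StringEater::Token.new_field(:zip, extract: false)
    t.opts      # => { extract: false }
    t.extract?  # => falsy, so the tokenizer skips extracting :zip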
    
        data/lib/version.rb
    CHANGED
    
@@ -1,8 +1,12 @@
+# encoding: utf-8
+
+# Extend StringEater with Version
 module StringEater
-  module VERSION
-    MAJOR = 0
-    MINOR = 2
-    PATCH = 2
+  # Version constants
+  module VERSION
+    MAJOR = 1
+    MINOR = 0
+    PATCH = 0
     PRE   = nil
     STRING = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
   end
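`STRING` relies on `compact` to drop the `nil` `PRE` segment, so final releases render as three dotted segments and pre-releases as four. In plain Ruby (illustrative values):

    [1, 0, 0, nil].compact.join('.')    # => "1.0.0"
    [1, 0, 0, 'pre1'].compact.join('.') # => "1.0.0.pre1"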
    
        data/spec/nginx_spec.rb
    CHANGED
    
@@ -1,32 +1,44 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'examples')))
 
 require 'nginx'
 
 describe NginxLogTokenizer do
   before(:each) do
     @tokenizer = NginxLogTokenizer.new
-    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+      '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+      '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+      'Trident/5.0)" "-" "there could be" other "stuff here"'
+    @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+      '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+      '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+      'WOW64; Trident/5.0)" "-"'
+  end
+
+  user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+    'WOW64; Trident/5.0)'
+
+  {
+    ip: '73.80.217.212',
+    request: 'GET /this_is_a_url HTTP/1.1',
+    status_code: 304,
+    referrer_url: 'http://referrer.com',
+    user_agent: user_agent,
+    remainder: "\"there could be\" other \"stuff here\"",
+  }.each_pair do |token, val|
+    it "finds the right value for #{token}" do
+      @tokenizer.tokenize!(@str).send(token).should == val
+    end
+  end
+
+  it 'correctly handles there not being a remainder' do
+    @tokenizer.tokenize!(@str2).remainder.should be_nil
+  end
 
 end
         | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    
    
        data/spec/string_eater_spec.rb
    CHANGED
    
    | @@ -1,193 +1,185 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            require 'spec_helper'
         | 
| 2 4 | 
             
            require 'string-eater'
         | 
| 3 5 |  | 
| 4 | 
            -
            TestedClass = StringEater::CTokenizer
         | 
| 5 | 
            -
             | 
| 6 6 | 
             
            describe StringEater do
         | 
| 7 | 
            -
              it  | 
| 8 | 
            -
                StringEater::VERSION::STRING.split( | 
| 7 | 
            +
              it 'has a version' do
         | 
| 8 | 
            +
                StringEater::VERSION::STRING.split('.').size.should >= 3
         | 
| 9 9 | 
             
              end
         | 
| 10 10 | 
             
            end
         | 
| 11 11 |  | 
| 12 12 | 
             
            # normal use
         | 
| 13 | 
            -
            class Example1 <  | 
| 13 | 
            +
            class Example1 < StringEater::CTokenizer
         | 
| 14 14 | 
             
              add_field :first_word
         | 
| 15 | 
            -
              look_for  | 
| 16 | 
            -
              add_field :second_word, : | 
| 17 | 
            -
              look_for  | 
| 15 | 
            +
              look_for ' '
         | 
| 16 | 
            +
              add_field :second_word, extract: false
         | 
| 17 | 
            +
              look_for '|'
         | 
| 18 18 | 
             
              add_field :third_word
         | 
| 19 19 | 
             
            end
         | 
| 20 20 |  | 
| 21 21 | 
             
            describe Example1 do
         | 
| 22 | 
            -
             | 
| 23 | 
            -
               | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
                 | 
| 22 | 
            +
              let(:tokenizer) { Example1.new }
         | 
| 23 | 
            +
              let(:first_word1) { 'foo' }
         | 
| 24 | 
            +
              let(:second_word1) { 'bar' }
         | 
| 25 | 
            +
              let(:third_word1) { 'baz' }
         | 
| 26 | 
            +
              let(:str1) { "#{first_word1} #{second_word1}|#{third_word1}" }
         | 
| 27 | 
            +
             | 
| 28 | 
            +
              describe '#extract_all_fields' do
         | 
| 29 | 
            +
                it 'extracts all of the fields' do
         | 
| 30 | 
            +
                  tokenizer.extract_all_fields
         | 
| 31 | 
            +
                  tokenizer.tokenize!(str1)
         | 
| 32 | 
            +
                  expect(tokenizer.first_word).to eq(first_word1)
         | 
| 33 | 
            +
                  expect(tokenizer.second_word).to eq(second_word1)
         | 
| 34 | 
            +
                  expect(tokenizer.third_word).to eq(third_word1)
         | 
| 35 | 
            +
                end
         | 
| 30 36 | 
             
              end
         | 
| 31 37 |  | 
| 32 | 
            -
              describe  | 
| 33 | 
            -
                it  | 
| 34 | 
            -
                   | 
| 38 | 
            +
              describe '#extract_no_fields' do
         | 
| 39 | 
            +
                it 'does not extract any of the fields' do
         | 
| 40 | 
            +
                  tokenizer.extract_no_fields
         | 
| 41 | 
            +
                  tokenizer.tokenize!(str1)
         | 
| 42 | 
            +
                  tokenizer.first_word.should be_nil
         | 
| 43 | 
            +
                  tokenizer.second_word.should be_nil
         | 
| 44 | 
            +
                  tokenizer.third_word.should be_nil
         | 
| 35 45 | 
             
                end
         | 
| 36 46 | 
             
              end
         | 
| 37 47 |  | 
| 38 | 
            -
              describe  | 
| 39 | 
            -
                it  | 
| 40 | 
            -
                   | 
| 41 | 
            -
                   | 
| 42 | 
            -
                   | 
| 43 | 
            -
                   | 
| 44 | 
            -
                   | 
| 48 | 
            +
              describe '#extract_fields' do
         | 
| 49 | 
            +
                it 'allows us to set which fields get extracted' do
         | 
| 50 | 
            +
                  tokenizer.extract_fields :second_word
         | 
| 51 | 
            +
                  tokenizer.tokenize!(str1)
         | 
| 52 | 
            +
                  tokenizer.first_word.should be_nil
         | 
| 53 | 
            +
                  expect(tokenizer.second_word).to eq(second_word1)
         | 
| 54 | 
            +
                  tokenizer.third_word.should be_nil
         | 
| 45 55 | 
             
                end
         | 
| 46 56 | 
             
              end
         | 
| 47 57 |  | 
| 48 | 
            -
              describe  | 
| 49 | 
            -
                it  | 
| 50 | 
            -
                   | 
| 51 | 
            -
                  @tokenizer.tokenize!(@str1)
         | 
| 52 | 
            -
                  @tokenizer.first_word.should be_nil
         | 
| 53 | 
            -
                  @tokenizer.second_word.should be_nil
         | 
| 54 | 
            -
                  @tokenizer.third_word.should be_nil
         | 
| 58 | 
            +
              describe 'tokenize!' do
         | 
| 59 | 
            +
                it 'returns itself' do
         | 
| 60 | 
            +
                  tokenizer.tokenize!(str1).should == tokenizer
         | 
| 55 61 | 
             
                end
         | 
| 56 | 
            -
              end
         | 
| 57 62 |  | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
                  @tokenizer.extract_fields :second_word
         | 
| 61 | 
            -
                  @tokenizer.tokenize!(@str1)
         | 
| 62 | 
            -
                  @tokenizer.first_word.should be_nil
         | 
| 63 | 
            -
                  @tokenizer.second_word.should == @second_word1
         | 
| 64 | 
            -
                  @tokenizer.third_word.should be_nil
         | 
| 63 | 
            +
                it 'sets the first word' do
         | 
| 64 | 
            +
                  tokenizer.tokenize!(str1).first_word.should == 'foo'
         | 
| 65 65 | 
             
                end
         | 
| 66 | 
            -
              end
         | 
| 67 66 |  | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
                  @tokenizer.tokenize!(@str1).should == @tokenizer
         | 
| 67 | 
            +
                it 'sets the third word' do
         | 
| 68 | 
            +
                  tokenizer.tokenize!(str1).third_word.should == 'baz'
         | 
| 71 69 | 
             
                end
         | 
| 72 70 |  | 
| 73 | 
            -
                it  | 
| 74 | 
            -
                   | 
| 71 | 
            +
                it 'does not set the second word' do
         | 
| 72 | 
            +
                  tokenizer.tokenize!(str1).second_word.should be_nil
         | 
| 75 73 | 
             
                end
         | 
| 76 74 |  | 
| 77 | 
            -
                it  | 
| 78 | 
            -
                   | 
| 75 | 
            +
                it 'yields a hash of tokens if a block is given' do
         | 
| 76 | 
            +
                  tokenizer.tokenize!(str1) do |tokens|
         | 
| 77 | 
            +
                    tokens[:first_word].should == 'foo'
         | 
| 78 | 
            +
                  end
         | 
| 79 79 | 
             
                end
         | 
| 80 80 |  | 
| 81 | 
            -
                it  | 
| 82 | 
            -
                   | 
| 81 | 
            +
                it 'returns everything to the end of the line for the last token' do
         | 
| 82 | 
            +
                  s = 'c defg asdf | foo , baa'
         | 
| 83 | 
            +
                  tokenizer.tokenize!("a b|#{s}").third_word.should == s
         | 
| 83 84 | 
             
                end
         | 
| 84 85 |  | 
| 85 | 
            -
                 | 
| 86 | 
            -
                   | 
| 87 | 
            -
             | 
| 86 | 
            +
                context 'when the last delimiter is missing' do
         | 
| 87 | 
            +
                  let(:s) { 'a b' }
         | 
| 88 | 
            +
                  it 'still finds the first word' do
         | 
| 89 | 
            +
                    expect(tokenizer.tokenize!(s).first_word).to eq('a')
         | 
| 88 90 | 
             
                  end
         | 
| 89 | 
            -
                end
         | 
| 90 91 |  | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
                   | 
| 94 | 
            -
                end
         | 
| 92 | 
            +
                  it 'returns nil for the second word' do
         | 
| 93 | 
            +
                    expect(tokenizer.tokenize!(s).second_word).to be_nil
         | 
| 94 | 
            +
                  end
         | 
| 95 95 |  | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
                  @tokenizer.tokenize!(s).third_word.should be_nil
         | 
| 96 | 
            +
                  it 'returns nil for the third word' do
         | 
| 97 | 
            +
                    expect(tokenizer.tokenize!(s).third_word).to be_nil
         | 
| 98 | 
            +
                  end
         | 
| 100 99 | 
             
                end
         | 
| 101 100 |  | 
| 102 | 
            -
             | 
| 101 | 
            +
                context 'when non_strict is enabled' do
         | 
| 102 | 
            +
                  before do
         | 
| 103 | 
            +
                    tokenizer.extract_all_fields
         | 
| 104 | 
            +
                    tokenizer.set_non_strict
         | 
| 105 | 
            +
                  end
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                  context 'when the last delimiter is missing' do
         | 
| 108 | 
            +
                    let(:s) { 'a b' }
         | 
| 109 | 
            +
                    it 'still finds the first word' do
         | 
| 110 | 
            +
                      expect(tokenizer.tokenize!(s).first_word).to eq('a')
         | 
| 111 | 
            +
                    end
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                    it 'still finds the second word' do
         | 
| 114 | 
            +
                      expect(tokenizer.tokenize!(s).second_word).to eq('b')
         | 
| 115 | 
            +
                    end
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                    it 'returns nil for the third word' do
         | 
| 118 | 
            +
                      expect(tokenizer.tokenize!(s).third_word).to be_nil
         | 
| 119 | 
            +
                    end
         | 
| 120 | 
            +
                  end
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                  context 'when the last delimiter is not missing' do
         | 
| 123 | 
            +
                    let(:s) { 'a b|c' }
         | 
| 124 | 
            +
                    it 'still finds the first word' do
         | 
| 125 | 
            +
                      expect(tokenizer.tokenize!(s).first_word).to eq('a')
         | 
| 126 | 
            +
                    end
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                    it 'still finds the second word' do
         | 
| 129 | 
            +
                      expect(tokenizer.tokenize!(s).second_word).to eq('b')
         | 
| 130 | 
            +
                    end
         | 
| 103 131 |  | 
| 132 | 
            +
                    it 'returns nil for the third word' do
         | 
| 133 | 
            +
                      expect(tokenizer.tokenize!(s).third_word).to eq('c')
         | 
| 134 | 
            +
                    end
         | 
| 135 | 
            +
                  end
         | 
| 136 | 
            +
                end
         | 
| 137 | 
            +
              end
         | 
| 104 138 | 
             
            end
         | 
| 105 139 |  | 
| 106 140 | 
             
            # an example where we ignore after a certain point in the string
         | 
| 107 | 
            -
            class Example2 <  | 
| 108 | 
            -
              add_field :first_word, : | 
| 109 | 
            -
              look_for  | 
| 141 | 
            +
            class Example2 < StringEater::CTokenizer
         | 
| 142 | 
            +
              add_field :first_word, extract: false
         | 
| 143 | 
            +
              look_for ' '
         | 
| 110 144 | 
             
              add_field :second_word
         | 
| 111 | 
            -
              look_for  | 
| 112 | 
            -
              add_field :third_word, : | 
| 113 | 
            -
              look_for  | 
| 145 | 
            +
              look_for ' '
         | 
| 146 | 
            +
              add_field :third_word, extract: false
         | 
| 147 | 
            +
              look_for '-'
         | 
| 114 148 | 
             
            end
         | 
| 115 149 |  | 
| 116 150 | 
             
            describe Example2 do
         | 
| 151 | 
            +
              let(:tokenizer) { Example2.new }
         | 
| 152 | 
            +
              let(:second_word1) { 'bar' }
         | 
| 153 | 
            +
              let(:str1) { "foo #{second_word1} baz-" }
         | 
| 117 154 |  | 
| 118 | 
            -
               | 
| 119 | 
            -
                 | 
| 120 | 
            -
             | 
| 121 | 
            -
             | 
| 122 | 
            -
              end
         | 
| 123 | 
            -
             | 
| 124 | 
            -
              describe "tokenize!" do
         | 
| 125 | 
            -
                it "should find the token when there is extra stuff at the end of the string" do
         | 
| 126 | 
            -
                  @tokenizer.tokenize!(@str1).second_word.should == @second_word1
         | 
| 155 | 
            +
              describe 'tokenize!' do
         | 
| 156 | 
            +
                it 'finds the token when there is extra stuff at the' +
         | 
| 157 | 
            +
                  'end of the string' do
         | 
| 158 | 
            +
                  tokenizer.tokenize!(str1).second_word.should == second_word1
         | 
| 127 159 | 
             
                end
         | 
| 128 160 | 
             
              end
         | 
| 129 161 |  | 
| 130 162 | 
             
            end
         | 
| 131 163 |  | 
| 132 164 | 
             
            # an example where the split is more than one char
         | 
| 133 | 
            -
            class Example3 <  | 
| 134 | 
            -
              look_for  | 
| 165 | 
            +
            class Example3 < StringEater::CTokenizer
         | 
| 166 | 
            +
              look_for 'foo='
         | 
| 135 167 | 
             
              add_field :foo_val
         | 
| 136 | 
            -
              look_for  | 
| 168 | 
            +
              look_for '&'
         | 
| 137 169 | 
             
            end
         | 
| 138 170 |  | 
| 139 171 | 
             
            describe Example3 do
         | 
| 140 | 
            -
               | 
| 141 | 
            -
                @tokenizer = Example3.new
         | 
| 142 | 
            -
              end
         | 
| 172 | 
            +
              let(:tokenizer) { Example3.new }
         | 
| 143 173 |  | 
| 144 | 
            -
              describe  | 
| 145 | 
            -
                it  | 
| 146 | 
            -
                   | 
| 174 | 
            +
              describe 'tokenize!' do
         | 
| 175 | 
            +
                it 'finds the token if there is only one occurrence ' +
         | 
| 176 | 
            +
                  'of the characters in the separator' do
         | 
| 177 | 
            +
                  tokenizer.tokenize!('abcd?foo=val&blah').foo_val.should == 'val'
         | 
| 147 178 | 
             
                end
         | 
| 148 179 |  | 
| 149 | 
            -
                it  | 
| 150 | 
            -
                   | 
| 180 | 
            +
                it 'still works if part of the separator token occurs' do
         | 
| 181 | 
            +
                  tokenizer.tokenize!('abcd?foo_blah=baz&foo=bar&buh')
         | 
| 182 | 
            +
                    .foo_val.should == 'bar'
         | 
| 151 183 | 
             
                end
         | 
| 152 184 | 
             
              end
         | 
| 153 185 | 
             
            end
         | 
| 154 | 
            -
             | 
| 155 | 
            -
            # CTokenizer doesn't do combine_fields because
         | 
| 156 | 
            -
            #  writing out breakpoints is a significant slow-down
         | 
| 157 | 
            -
            if TestedClass.respond_to?(:combine_fields)
         | 
| 158 | 
            -
              # an example where we combine fields
         | 
| 159 | 
            -
              class Example3 < TestedClass
         | 
| 160 | 
            -
                add_field :first_word, :extract => false
         | 
| 161 | 
            -
                look_for " \""
         | 
| 162 | 
            -
                add_field :part1, :extract => false
         | 
| 163 | 
            -
                look_for " "
         | 
| 164 | 
            -
                add_field :part2
         | 
| 165 | 
            -
                look_for " "
         | 
| 166 | 
            -
                add_field :part3, :extract => false
         | 
| 167 | 
            -
                look_for "\""
         | 
| 168 | 
            -
             | 
| 169 | 
            -
                combine_fields :from => :part1, :to => :part3, :as => :parts
         | 
| 170 | 
            -
              end
         | 
| 171 | 
            -
             | 
| 172 | 
            -
              describe Example3 do
         | 
| 173 | 
            -
                before(:each) do
         | 
| 174 | 
            -
                  @tokenizer = Example3.new
         | 
| 175 | 
            -
                  @str1 = "foo \"bar baz bang\""
         | 
| 176 | 
            -
                  @part2 = "baz"
         | 
| 177 | 
            -
                  @parts = "bar baz bang"
         | 
| 178 | 
            -
                end
         | 
| 179 | 
            -
             | 
| 180 | 
            -
                it "should extract like normal" do
         | 
| 181 | 
            -
                  @tokenizer.tokenize!(@str1).part2.should == @part2
         | 
| 182 | 
            -
                end
         | 
| 183 | 
            -
             | 
| 184 | 
            -
                it "should ignore like normal" do
         | 
| 185 | 
            -
                  @tokenizer.tokenize!(@str1).part1.should be_nil
         | 
| 186 | 
            -
                end
         | 
| 187 | 
            -
             | 
| 188 | 
            -
                it "should extract the combined field" do
         | 
| 189 | 
            -
                  @tokenizer.tokenize!(@str1).parts.should == @parts
         | 
| 190 | 
            -
                end
         | 
| 191 | 
            -
             | 
| 192 | 
            -
              end
         | 
| 193 | 
            -
            end
         | 
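
The spec above notes that `CTokenizer` omits `combine_fields` because writing out breakpoints is a significant slow-down. A caller can still recover a combined span by extracting the bounding fields and rejoining them with the known literal separators. A minimal sketch, reusing the field layout of `Example3`; the class name `QuotedLine` is illustrative, and the `StringEater::Tokenizer` base class is assumed from the README:

    require 'string-eater'

    # Same layout as Example3, but part1 and part3 are extracted so we
    # can rebuild the combined span by hand. (A sketch, not gem code.)
    class QuotedLine < StringEater::Tokenizer
      add_field :first_word, :extract => false
      look_for " \""
      add_field :part1
      look_for " "
      add_field :part2
      look_for " "
      add_field :part3
      look_for "\""
    end

    t = QuotedLine.new.tokenize!("foo \"bar baz bang\"")
    parts = [t.part1, t.part2, t.part3].join(" ")
    parts # => "bar baz bang"

This only works because the separators between the combined fields are known single literals, which is exactly the case `combine_fields` handled in the pure-Ruby tokenizers below.
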
    
        metadata
    CHANGED
    
    | @@ -1,15 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: string-eater
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
  version: 0.2.2 | 
| 5 | 
            -
              prerelease: 
         | 
| 4 | 
            +
              version: 1.0.0
         | 
| 6 5 | 
             
            platform: ruby
         | 
| 7 6 | 
             
            authors:
         | 
| 8 7 | 
             
            - Dan Swain
         | 
| 9 8 | 
             
            autorequire: 
         | 
| 10 9 | 
             
            bindir: bin
         | 
| 11 10 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date:  | 
| 11 | 
            +
            date: 2014-01-05 00:00:00.000000000 Z
         | 
| 13 12 | 
             
            dependencies: []
         | 
| 14 13 | 
             
            description: Fast string tokenizer. Nom strings.
         | 
| 15 14 | 
             
            email:
         | 
| @@ -20,8 +19,6 @@ extensions: | |
| 20 19 | 
             
            extra_rdoc_files: []
         | 
| 21 20 | 
             
            files:
         | 
| 22 21 | 
             
            - lib/c-tokenizer.rb
         | 
| 23 | 
            -
            - lib/ruby-tokenizer-each-char.rb
         | 
| 24 | 
            -
            - lib/ruby-tokenizer.rb
         | 
| 25 22 | 
             
            - lib/string-eater.rb
         | 
| 26 23 | 
             
            - lib/token.rb
         | 
| 27 24 | 
             
            - lib/version.rb
         | 
| @@ -37,28 +34,27 @@ files: | |
| 37 34 | 
             
            - README.md
         | 
| 38 35 | 
             
            homepage: http://github.com/simplifi/string-eater
         | 
| 39 36 | 
             
            licenses: []
         | 
| 37 | 
            +
            metadata: {}
         | 
| 40 38 | 
             
            post_install_message: 
         | 
| 41 39 | 
             
            rdoc_options: []
         | 
| 42 40 | 
             
            require_paths:
         | 
| 43 41 | 
             
            - lib
         | 
| 44 42 | 
             
            - ext/string-eater
         | 
| 45 43 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 46 | 
            -
              none: false
         | 
| 47 44 | 
             
              requirements:
         | 
| 48 45 | 
             
              - - ! '>='
         | 
| 49 46 | 
             
                - !ruby/object:Gem::Version
         | 
| 50 47 | 
             
                  version: '0'
         | 
| 51 48 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 52 | 
            -
              none: false
         | 
| 53 49 | 
             
              requirements:
         | 
| 54 50 | 
             
              - - ! '>='
         | 
| 55 51 | 
             
                - !ruby/object:Gem::Version
         | 
| 56 52 | 
             
                  version: '0'
         | 
| 57 53 | 
             
            requirements: []
         | 
| 58 54 | 
             
            rubyforge_project: 
         | 
| 59 | 
            -
            rubygems_version:  | 
| 55 | 
            +
            rubygems_version: 2.0.6
         | 
| 60 56 | 
             
            signing_key: 
         | 
| 61 | 
            -
            specification_version:  | 
| 57 | 
            +
            specification_version: 4
         | 
| 62 58 | 
             
            summary: Fast string tokenizer.  Nom strings.
         | 
| 63 59 | 
             
            test_files:
         | 
| 64 60 | 
             
            - spec/nginx_spec.rb
         | 
    
        data/lib/ruby-tokenizer-each-char.rb
    DELETED
    
| @@ -1,145 +0,0 @@ | |
| 1 | 
            -
            # this tokenizer is very slow, but it illustrates the
         | 
| 2 | 
            -
            # basic idea of the C tokenizer
         | 
| 3 | 
            -
            class StringEater::RubyTokenizerEachChar
         | 
| 4 | 
            -
             | 
| 5 | 
            -
              def self.tokens
         | 
| 6 | 
            -
                @tokens ||= []
         | 
| 7 | 
            -
              end
         | 
| 8 | 
            -
             | 
| 9 | 
            -
              def self.combined_tokens
         | 
| 10 | 
            -
                @combined_tokens ||= []
         | 
| 11 | 
            -
              end
         | 
| 12 | 
            -
             | 
| 13 | 
            -
              def self.add_field name, opts={}
         | 
| 14 | 
            -
                self.tokens << StringEater::Token::new_field(name, opts)
         | 
| 15 | 
            -
                define_method(name) {@extracted_tokens[name]}
         | 
| 16 | 
            -
              end
         | 
| 17 | 
            -
             | 
| 18 | 
            -
              def self.look_for tokens
         | 
| 19 | 
            -
                self.tokens << StringEater::Token::new_separator(tokens)
         | 
| 20 | 
            -
              end
         | 
| 21 | 
            -
             | 
| 22 | 
            -
              def self.combine_fields opts={}
         | 
| 23 | 
            -
                from_token_index = self.tokens.index{|t| t.name == opts[:from]}
         | 
| 24 | 
            -
                to_token_index = self.tokens.index{|t| t.name == opts[:to]}
         | 
| 25 | 
            -
                self.combined_tokens << [opts[:as], from_token_index, to_token_index]
         | 
| 26 | 
            -
                define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
         | 
| 27 | 
            -
              end
         | 
| 28 | 
            -
             | 
| 29 | 
            -
              def tokens
         | 
| 30 | 
            -
                @tokens ||= self.class.tokens
         | 
| 31 | 
            -
              end
         | 
| 32 | 
            -
             | 
| 33 | 
            -
              def combined_tokens
         | 
| 34 | 
            -
                @combined_tokens ||= self.class.combined_tokens
         | 
| 35 | 
            -
              end
         | 
| 36 | 
            -
             | 
| 37 | 
            -
              def refresh_tokens
         | 
| 38 | 
            -
                @combined_tokens = nil
         | 
| 39 | 
            -
                @tokens = nil
         | 
| 40 | 
            -
                tokens
         | 
| 41 | 
            -
              end
         | 
| 42 | 
            -
             | 
| 43 | 
            -
              def describe_line
         | 
| 44 | 
            -
                tokens.inject("") do |desc, t|
         | 
| 45 | 
            -
                  desc << (t.string || t.name.to_s || "xxxxxx")
         | 
| 46 | 
            -
                end
         | 
| 47 | 
            -
              end
         | 
| 48 | 
            -
             | 
| 49 | 
            -
              def find_breakpoints string
         | 
| 50 | 
            -
                tokenize!(string) unless @string == string
         | 
| 51 | 
            -
                tokens.inject([]) do |bp, t|
         | 
| 52 | 
            -
                  bp << t.breakpoints
         | 
| 53 | 
            -
                  bp
         | 
| 54 | 
            -
                end.flatten.uniq
         | 
| 55 | 
            -
              end
         | 
| 56 | 
            -
             | 
| 57 | 
            -
              def tokenize! string, &block
         | 
| 58 | 
            -
                @string = string
         | 
| 59 | 
            -
                @extracted_tokens ||= {}
         | 
| 60 | 
            -
                @extracted_tokens.clear
         | 
| 61 | 
            -
                @tokens_to_find ||= tokens.each_with_index.map do |t, i| 
         | 
| 62 | 
            -
                  [i, t.string] if t.string
         | 
| 63 | 
            -
                end.compact
         | 
| 64 | 
            -
                @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
         | 
| 65 | 
            -
                  i if t.extract?
         | 
| 66 | 
            -
                end.compact
         | 
| 67 | 
            -
             | 
| 68 | 
            -
                tokens.first.breakpoints[0] = 0
         | 
| 69 | 
            -
             | 
| 70 | 
            -
                find_index = 0
         | 
| 71 | 
            -
             | 
| 72 | 
            -
                curr_token = @tokens_to_find[find_index]
         | 
| 73 | 
            -
                curr_token_index = curr_token[0]
         | 
| 74 | 
            -
                curr_token_length = curr_token[1].length
         | 
| 75 | 
            -
                looking_for_index = 0
         | 
| 76 | 
            -
                looking_for = curr_token[1][looking_for_index]
         | 
| 77 | 
            -
             | 
| 78 | 
            -
                counter = 0
         | 
| 79 | 
            -
                string.each_char do |c|
         | 
| 80 | 
            -
                  if c == looking_for
         | 
| 81 | 
            -
                    if looking_for_index == 0
         | 
| 82 | 
            -
                      # entering new token
         | 
| 83 | 
            -
                      if curr_token_index > 0
         | 
| 84 | 
            -
                        t = tokens[curr_token_index - 1]
         | 
| 85 | 
            -
                        t.breakpoints[1] = counter
         | 
| 86 | 
            -
                        if t.extract?
         | 
| 87 | 
            -
                          @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
         | 
| 88 | 
            -
                        end
         | 
| 89 | 
            -
                      end
         | 
| 90 | 
            -
                      tokens[curr_token_index].breakpoints[0] = counter
         | 
| 91 | 
            -
                    end
         | 
| 92 | 
            -
                    if looking_for_index >= (curr_token_length - 1)
         | 
| 93 | 
            -
                      # leaving token
         | 
| 94 | 
            -
                      tokens[curr_token_index].breakpoints[1] = counter
         | 
| 95 | 
            -
             | 
| 96 | 
            -
                      if curr_token_index >= tokens.size-1
         | 
| 97 | 
            -
                        # we're done!
         | 
| 98 | 
            -
                        break
         | 
| 99 | 
            -
                      else
         | 
| 100 | 
            -
                        tokens[curr_token_index + 1].breakpoints[0] = counter + 1
         | 
| 101 | 
            -
                      end
         | 
| 102 | 
            -
             | 
| 103 | 
            -
                      # next token
         | 
| 104 | 
            -
                      find_index += 1
         | 
| 105 | 
            -
                      if find_index >= @tokens_to_find.length
         | 
| 106 | 
            -
                        # we're done!
         | 
| 107 | 
            -
                        break
         | 
| 108 | 
            -
                      end
         | 
| 109 | 
            -
                      curr_token = @tokens_to_find[find_index]
         | 
| 110 | 
            -
                      curr_token_index = curr_token[0]
         | 
| 111 | 
            -
                      curr_token_length = curr_token[1].length
         | 
| 112 | 
            -
                      looking_for_index = 0
         | 
| 113 | 
            -
                    else
         | 
| 114 | 
            -
                      looking_for_index += 1
         | 
| 115 | 
            -
                    end
         | 
| 116 | 
            -
                  end
         | 
| 117 | 
            -
                  looking_for = curr_token[1][looking_for_index]
         | 
| 118 | 
            -
                  counter += 1
         | 
| 119 | 
            -
                end
         | 
| 120 | 
            -
             | 
| 121 | 
            -
                last_token = tokens.last
         | 
| 122 | 
            -
                last_token.breakpoints[1] = string.length
         | 
| 123 | 
            -
             | 
| 124 | 
            -
                if last_token.extract?
         | 
| 125 | 
            -
                  @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
         | 
| 126 | 
            -
                end
         | 
| 127 | 
            -
             | 
| 128 | 
            -
                combined_tokens.each do |combiner|
         | 
| 129 | 
            -
                  name = combiner[0]
         | 
| 130 | 
            -
                  from = @tokens[combiner[1]].breakpoints[0]
         | 
| 131 | 
            -
                  to = @tokens[combiner[2]].breakpoints[1]
         | 
| 132 | 
            -
                  @extracted_tokens[name] = string[from...to]
         | 
| 133 | 
            -
                end
         | 
| 134 | 
            -
             | 
| 135 | 
            -
                if block_given?
         | 
| 136 | 
            -
                  yield @extracted_tokens
         | 
| 137 | 
            -
                end
         | 
| 138 | 
            -
             | 
| 139 | 
            -
                # return self for chaining
         | 
| 140 | 
            -
                self
         | 
| 141 | 
            -
              end
         | 
| 142 | 
            -
             | 
| 143 | 
            -
            end
         | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
    
        data/lib/ruby-tokenizer.rb
    DELETED
    
    | @@ -1,98 +0,0 @@ | |
| 1 | 
            -
            # this tokenizer is fairly fast, but not necessarily faster than regexps
         | 
| 2 | 
            -
            class StringEater::RubyTokenizer
         | 
| 3 | 
            -
              def self.tokens
         | 
| 4 | 
            -
                @tokens ||= []
         | 
| 5 | 
            -
              end
         | 
| 6 | 
            -
             | 
| 7 | 
            -
              def self.combined_tokens
         | 
| 8 | 
            -
                @combined_tokens ||= []
         | 
| 9 | 
            -
              end
         | 
| 10 | 
            -
             | 
| 11 | 
            -
              def self.add_field name, opts={}
         | 
| 12 | 
            -
                self.tokens << StringEater::Token::new_field(name, opts)
         | 
| 13 | 
            -
                define_method(name) {@extracted_tokens[name]}
         | 
| 14 | 
            -
              end
         | 
| 15 | 
            -
             | 
| 16 | 
            -
              def self.look_for tokens
         | 
| 17 | 
            -
                self.tokens << StringEater::Token::new_separator(tokens)
         | 
| 18 | 
            -
              end
         | 
| 19 | 
            -
             | 
| 20 | 
            -
              def self.combine_fields opts={}
         | 
| 21 | 
            -
                from_token_index = self.tokens.index{|t| t.name == opts[:from]}
         | 
| 22 | 
            -
                to_token_index = self.tokens.index{|t| t.name == opts[:to]}
         | 
| 23 | 
            -
                self.combined_tokens << [opts[:as], from_token_index, to_token_index]
         | 
| 24 | 
            -
                define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
         | 
| 25 | 
            -
              end
         | 
| 26 | 
            -
             | 
| 27 | 
            -
              def tokens
         | 
| 28 | 
            -
                @tokens ||= self.class.tokens
         | 
| 29 | 
            -
              end
         | 
| 30 | 
            -
             | 
| 31 | 
            -
              def combined_tokens
         | 
| 32 | 
            -
                @combined_tokens ||= self.class.combined_tokens
         | 
| 33 | 
            -
              end
         | 
| 34 | 
            -
             | 
| 35 | 
            -
              def refresh_tokens
         | 
| 36 | 
            -
                @combined_tokens = nil
         | 
| 37 | 
            -
                @tokens = nil
         | 
| 38 | 
            -
                tokens
         | 
| 39 | 
            -
              end
         | 
| 40 | 
            -
             | 
| 41 | 
            -
              def describe_line
         | 
| 42 | 
            -
                tokens.inject("") do |desc, t|
         | 
| 43 | 
            -
                  desc << (t.string || t.name.to_s || "xxxxxx")
         | 
| 44 | 
            -
                end
         | 
| 45 | 
            -
              end
         | 
| 46 | 
            -
             | 
| 47 | 
            -
              def find_breakpoints(string)
         | 
| 48 | 
            -
                @literal_tokens ||= tokens.select{|t| t.string}
         | 
| 49 | 
            -
                @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
         | 
| 50 | 
            -
                @breakpoints[0] = 0
         | 
| 51 | 
            -
                @breakpoints[-1] = string.length
         | 
| 52 | 
            -
                start_point = 0
         | 
| 53 | 
            -
                @literal_tokens.each_with_index do |t, i|
         | 
| 54 | 
            -
                  @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
         | 
| 55 | 
            -
                  @breakpoints[2*i+2] = start_point
         | 
| 56 | 
            -
                end
         | 
| 57 | 
            -
                @breakpoints
         | 
| 58 | 
            -
              end
         | 
| 59 | 
            -
             | 
| 60 | 
            -
              def tokenize! string, &block
         | 
| 61 | 
            -
                @extracted_tokens ||= {}
         | 
| 62 | 
            -
                @extracted_tokens.clear
         | 
| 63 | 
            -
                @tokens_to_extract ||= tokens.select{|t| t.extract?}
         | 
| 64 | 
            -
             | 
| 65 | 
            -
                find_breakpoints(string)
         | 
| 66 | 
            -
                last_important_bp = [@breakpoints.length, tokens.size].min
         | 
| 67 | 
            -
                (0...last_important_bp).each do |i|
         | 
| 68 | 
            -
                  tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
         | 
| 69 | 
            -
                end
         | 
| 70 | 
            -
             | 
| 71 | 
            -
                @tokens_to_extract.each do |t|
         | 
| 72 | 
            -
                  @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
         | 
| 73 | 
            -
                end
         | 
| 74 | 
            -
             | 
| 75 | 
            -
                combined_tokens.each do |combiner|
         | 
| 76 | 
            -
                  name = combiner[0]
         | 
| 77 | 
            -
                  from = @tokens[combiner[1]].breakpoints[0]
         | 
| 78 | 
            -
                  to = @tokens[combiner[2]].breakpoints[1]
         | 
| 79 | 
            -
                  @extracted_tokens[name] = string[from...to]
         | 
| 80 | 
            -
                end
         | 
| 81 | 
            -
             | 
| 82 | 
            -
                if block_given?
         | 
| 83 | 
            -
                  yield @extracted_tokens
         | 
| 84 | 
            -
                end
         | 
| 85 | 
            -
             | 
| 86 | 
            -
                # return self for chaining
         | 
| 87 | 
            -
                self
         | 
| 88 | 
            -
              end
         | 
| 89 | 
            -
             | 
| 90 | 
            -
              protected
         | 
| 91 | 
            -
             | 
| 92 | 
            -
              def find_end_of token, string, start_at
         | 
| 93 | 
            -
                start = string.index(token.string, start_at+1) || string.length
         | 
| 94 | 
            -
                [start, [start + token.string.length, string.length].min]
         | 
| 95 | 
            -
              end
         | 
| 96 | 
            -
             | 
| 97 | 
            -
            end
         | 
| 98 | 
            -
             |
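
The header comment of this deleted file claims the pure-Ruby tokenizer is "fairly fast, but not necessarily faster than regexps." A rough harness for checking that claim against the C-backed tokenizer that remains in 1.0.0; the field names, separator layout, and class name are illustrative, and `StringEater::Tokenizer` is assumed from the README:

    require 'benchmark'
    require 'string-eater'

    # Tokenizer equivalent of /\?foo=([^&]*)&/ on the spec's test string.
    class QueryLine < StringEater::Tokenizer
      add_field :first_word, :extract => false
      look_for "?foo="
      add_field :foo_val
      look_for "&"
      add_field :rest, :extract => false
    end

    LINE      = "abcd?foo=val&blah"
    REGEX     = /\?foo=([^&]*)&/
    tokenizer = QueryLine.new

    Benchmark.bm(10) do |b|
      b.report("regexp")    { 100_000.times { REGEX.match(LINE)[1] } }
      b.report("tokenizer") { 100_000.times { tokenizer.tokenize!(LINE).foo_val } }
    end
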