word_count_analyzer 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -16
- data/lib/word_count_analyzer/ellipsis.rb +1 -1
- data/lib/word_count_analyzer/slash.rb +3 -0
- data/lib/word_count_analyzer/version.rb +1 -1
- data/spec/word_count_analyzer/counter_spec.rb +6 -0
- metadata +1 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: b715b4c2304ae956644121693d584b0b37638845
         | 
| 4 | 
            +
              data.tar.gz: 3b6d753132d5b9e511378ec06f990efa21fcc86b
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 504f3e98b336ebcce4129b137e5293f3fbe38593fd935d55b56d9807f77c17a7e5ae955c4fba41fde8fe8a9e725232253038e5b912ff892eed8ece89f1819b34
         | 
| 7 | 
            +
              data.tar.gz: 2ae63d08ea641d00f60f4d83f07e1a376cd51cdc632bafcf96a1c3e6b7b173c5e970625fa51749b879893e0386a202bebf799448703373cccd6c855bf8c56040
         | 
    
        data/README.md
    CHANGED
    
    | @@ -151,6 +151,8 @@ WordCountAnalyzer::Counter.new( | |
| 151 151 |  | 
| 152 152 | 
             
            ##### `date`
         | 
| 153 153 | 
             
              **default** = `'no_special_treatment'`
         | 
| 154 | 
            +
            - `'no_special_treatment'`   
         | 
| 155 | 
            +
              Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
         | 
| 154 156 | 
             
            - `'count_as_one'`  
         | 
| 155 157 | 
             
              Counts a date as one word. This is common in computer-assisted translation (CAT) tools, where a date is treated as a *placeable* that can usually be translated automatically. Examples:
         | 
| 156 158 | 
             
              - Monday, April 4th, 2011 (1 word)
         | 
| @@ -163,8 +165,6 @@ WordCountAnalyzer::Counter.new( | |
| 163 165 | 
             
              - 2003 November 9 (1 word)
         | 
| 164 166 | 
             
              - 2003-Nov-9 (1 word)
         | 
| 165 167 | 
             
              - and others...
         | 
| 166 | 
            -
            - `'no_special_treatment'`   
         | 
| 167 | 
            -
              Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
         | 
| 168 168 |  | 
| 169 169 | 
             
            <hr>
         | 
| 170 170 |  | 
| @@ -197,15 +197,15 @@ WordCountAnalyzer::Counter.new( | |
| 197 197 |  | 
| 198 198 | 
             
            ##### `forward_slash`
         | 
| 199 199 | 
             
              **default** = `'count_as_multiple_except_dates'`
         | 
| 200 | 
            -
            - `'count_as_one'`  
         | 
| 201 | 
            -
              Counts any tokens that include a forward slash as one word. Example:
         | 
| 202 | 
            -
              - she/he/it (1 word)
         | 
| 203 | 
            -
            - `'count_as_multiple'`   
         | 
| 204 | 
            -
              Separates any tokens that include a forward slash at the slash(es) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
         | 
| 205 | 
            -
              - she/he/it (3 words)
         | 
| 206 200 | 
             
            - `'count_as_multiple_except_dates'`   
         | 
| 207 201 | 
             
              Separates any tokens that include a forward slash (except dates) at the slash(es) and counts each token individually. Example:
         | 
| 208 202 | 
             
              - she/he/it 4/25/2014 (4 words)  
         | 
| 203 | 
            +
            - `'count_as_multiple'`   
         | 
| 204 | 
            +
              Separates any tokens that include a forward slash at the slash(es) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
         | 
| 205 | 
            +
              - she/he/it (3 words)  
         | 
| 206 | 
            +
            - `'count_as_one'`  
         | 
| 207 | 
            +
              Counts any tokens that include a forward slash as one word. Example:
         | 
| 208 | 
            +
              - she/he/it (1 word)
         | 
| 209 209 |  | 
| 210 210 | 
             
            <hr>
         | 
| 211 211 |  | 
| @@ -222,37 +222,37 @@ WordCountAnalyzer::Counter.new( | |
| 222 222 |  | 
| 223 223 | 
             
            ##### `dotted_line`
         | 
| 224 224 | 
             
              **default** = `'ignore'`
         | 
| 225 | 
            -
            - `'count'`  
         | 
| 226 | 
            -
              Counts a dotted line as one word.
         | 
| 227 225 | 
             
            - `'ignore'`   
         | 
| 228 226 | 
             
              Ignores any dotted lines in the string and does not count them towards the word count.
         | 
| 227 | 
            +
            - `'count'`  
         | 
| 228 | 
            +
              Counts a dotted line as one word.
         | 
| 229 229 |  | 
| 230 230 | 
             
            <hr>
         | 
| 231 231 |  | 
| 232 232 | 
             
            ##### `dashed_line`
         | 
| 233 233 | 
             
              **default** = `'ignore'`
         | 
| 234 | 
            +
            - `'ignore'`   
         | 
| 235 | 
            +
              Ignores any dashed lines in the string and does not count them towards the word count.  
         | 
| 234 236 | 
             
            - `'count'`  
         | 
| 235 237 | 
             
              Counts a dashed line as one word.
         | 
| 236 | 
            -
            - `'ignore'`   
         | 
| 237 | 
            -
              Ignores any dashed lines in the string and does not count them towards the word count.
         | 
| 238 238 |  | 
| 239 239 | 
             
            <hr>
         | 
| 240 240 |  | 
| 241 241 | 
             
            ##### `underscore`
         | 
| 242 242 | 
             
              **default** = `'ignore'`
         | 
| 243 | 
            +
            - `'ignore'`   
         | 
| 244 | 
            +
              Ignores any series of underscores in the string and does not count them towards the word count.     
         | 
| 243 245 | 
             
            - `'count'`  
         | 
| 244 246 | 
             
              Counts a series of underscores as one word.
         | 
| 245 | 
            -
            - `'ignore'`   
         | 
| 246 | 
            -
              Ignores any series of underscores in the string and does not count them towards the word count.      
         | 
| 247 247 |  | 
| 248 248 | 
             
            <hr>
         | 
| 249 249 |  | 
| 250 250 | 
             
            ##### `stray_punctuation`
         | 
| 251 251 | 
             
              **default** = `'ignore'`
         | 
| 252 | 
            +
            - `'ignore'`   
         | 
| 253 | 
            +
              Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count.    
         | 
| 252 254 | 
             
            - `'count'`  
         | 
| 253 255 | 
             
              Counts a punctuation mark surrounded on both sides by a whitespace as one word.
         | 
| 254 | 
            -
            - `'ignore'`   
         | 
| 255 | 
            -
              Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count.     
         | 
| 256 256 |  | 
| 257 257 | 
             
            ### Gray Area Details
         | 
| 258 258 |  | 
| @@ -9,7 +9,7 @@ module WordCountAnalyzer | |
| 9 9 | 
             
                # Rubular: http://rubular.com/r/2VvZ8wRbd8
         | 
| 10 10 | 
             
                FOUR_SPACE_REGEX = /(?<=[a-z])(\.\s){3}\.(\z|$|\n)/
         | 
| 11 11 |  | 
| 12 | 
            -
                OTHER_THREE_PERIOD_REGEX = /[^\.]\.{3}([^\.]|$)/
         | 
| 12 | 
            +
                OTHER_THREE_PERIOD_REGEX = /(?<=[^\.])\.{3}(?=([^\.]|$))/
         | 
| 13 13 |  | 
| 14 14 | 
             
                UNICODE_ELLIPSIS = /(?<=[^…])…{1}(?=[^…])/
         | 
| 15 15 |  | 
| @@ -60,6 +60,7 @@ module WordCountAnalyzer | |
| 60 60 | 
             
                  processed_string.gsub!(FORWARD_SLASH_REGEX).each do |match|
         | 
| 61 61 | 
             
                    match.split(/\/+/).join(' ')
         | 
| 62 62 | 
             
                  end
         | 
| 63 | 
            +
                  processed_string
         | 
| 63 64 | 
             
                end
         | 
| 64 65 |  | 
| 65 66 | 
             
                def replace_forward_slashes_except_dates
         | 
| @@ -68,6 +69,7 @@ module WordCountAnalyzer | |
| 68 69 | 
             
                  except_date_string.gsub!(FORWARD_SLASH_REGEX).each do |match|
         | 
| 69 70 | 
             
                    match.split(/\/+/).join(' ')
         | 
| 70 71 | 
             
                  end
         | 
| 72 | 
            +
                  except_date_string
         | 
| 71 73 | 
             
                end
         | 
| 72 74 |  | 
| 73 75 | 
             
                def backslash_occurences
         | 
| @@ -79,6 +81,7 @@ module WordCountAnalyzer | |
| 79 81 | 
             
                  processed_string.gsub!(BACKSLASH_REGEX).each do |match|
         | 
| 80 82 | 
             
                    ' word ' * match.split(/\\+/).length
         | 
| 81 83 | 
             
                  end
         | 
| 84 | 
            +
                  processed_string
         | 
| 82 85 | 
             
                end
         | 
| 83 86 | 
             
              end
         | 
| 84 87 | 
             
            end
         | 
| @@ -553,6 +553,12 @@ RSpec.describe WordCountAnalyzer::Counter do | |
| 553 553 | 
             
                expect(ws.count).to eq(66)
         | 
| 554 554 | 
             
              end
         | 
| 555 555 |  | 
| 556 | 
            +
              it 'counts the words in a string #005' do
         | 
| 557 | 
            +
                text = "Hello world... 11/22/2013"
         | 
| 558 | 
            +
                ws = WordCountAnalyzer::Counter.new(text: text)
         | 
| 559 | 
            +
                expect(ws.count).to eq(3)
         | 
| 560 | 
            +
              end
         | 
| 561 | 
            +
             | 
| 556 562 | 
             
              context 'Pages Word Count' do
         | 
| 557 563 | 
             
                it 'reverse engineers Pages word count #001' do
         | 
| 558 564 | 
             
                  text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. <html> Some HTML and a hyphenated-word</html>. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
         |