hashtag_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4c7d5f6c5c41e2d5963e60b8a8491267a0cff44ec509c4e9f0613f002adee10c
4
+ data.tar.gz: 67347bfead8c1283f28c72853f799315f310aea950e2b35e150a50c3e91feebb
5
+ SHA512:
6
+ metadata.gz: 7d58badaaac7976181e3c41491363941c030ef8a8892364f300985684653bc13a422bbdcafcc3753c2b80c986ffb0771106171aebbe8af3e6c1a3df1c4822d0e
7
+ data.tar.gz: 997f35e7ceb0f9b057a292ad3680b92a844fea9efebb99a08ab9ba7545b3f62c28b42802c037d0156084ef6f92d0e62aa9a4be9820ebf82d4a4a586a6f6b1a05
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ .byebug_history
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.6.5
6
+ before_install: gem install bundler -v 2.1.4
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in hashtag_parser.gemspec
4
+ gemspec
5
+
6
+ gem "rake", "~> 12.0"
7
+ gem "minitest", "~> 5.0"
data/Gemfile.lock ADDED
@@ -0,0 +1,23 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ hashtag_parser (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ byebug (11.1.1)
10
+ minitest (5.14.0)
11
+ rake (12.3.3)
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ byebug
18
+ hashtag_parser!
19
+ minitest (~> 5.0)
20
+ rake (~> 12.0)
21
+
22
+ BUNDLED WITH
23
+ 2.1.4
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Mariano Vallés
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # HashtagParser
2
+
3
+ [![Build Status](https://travis-ci.org/zucaritask/hashtag_parser.svg?branch=master)](https://travis-ci.org/zucaritask/hashtag_parser)
4
+
5
+ A hashtag parser that brings the behavior of [hashtag-rs](https://github.com/tonsser/hashtag-rs)
6
+ to Ruby, building on the work of [davidpdrsn](https://github.com/davidpdrsn), who used a Finite State Machine to do the parsing.
7
+
8
+ Given a string such as `"#ruby is #awesome"`, this gem returns an array containing one object with `text`, `start` and `end`
9
+ for each of the two hashtags in the string.
10
+
11
+ Its goal is to match Instagram's parsing of hashtags.
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ ```ruby
18
+ gem 'hashtag_parser'
19
+ ```
20
+
21
+ And then execute:
22
+
23
+ $ bundle install
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install hashtag_parser
28
+
29
+ ## Usage
30
+
31
+ ```ruby
32
+ hashtags = HashtagParser.parse("#ruby is #awesome")
33
+ ```
34
+
35
+ `hashtags` is going to be:
36
+
37
+ ```
38
+ [
39
+ {
40
+ text: "ruby",
41
+ start: 0,
42
+ end: 4
43
+ },
44
+ {
45
+ text: "awesome",
46
+ start: 9,
47
+ end: 16
48
+ }
49
+ ]
50
+ ```
51
+ ## Contact
52
+
53
+ Twitter: [@zucaritask](http://twitter.com/zucaritask)
54
+
55
+ Github: [@zucaritask](http://github.com/zucaritask)
56
+
57
+ ## Contributing
58
+
59
+ 1. Fork it
60
+ 1. Create your feature branch (git checkout -b my-new-feature)
61
+ 1. Commit your changes (git commit -am 'Add some feature')
62
+ 1. Push to the branch (git push origin my-new-feature)
63
+ 1. Create new Pull Request
64
+
65
+ ## License
66
+
67
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new(:test) do |t|
5
+ t.libs << "test"
6
+ t.libs << "lib"
7
+ t.test_files = FileList["test/**/*_test.rb"]
8
+ end
9
+
10
+ task :default => :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "hashtag_parser"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,29 @@
1
+ require_relative 'lib/hashtag_parser.rb'
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "hashtag_parser"
5
+ spec.version = HashtagParser::VERSION
6
+ spec.authors = ["Mariano Vallés"]
7
+ spec.email = ["zucaritas@gmail.com"]
8
+
9
+ spec.summary = "Hashtag parser returning start, end and text of each hashtag"
10
+ spec.description = "Hashtag parser inspired by https://github.com/tonsser/hashtag-rs/\n"\
11
+ "Given a string, this gem returns an array with an object with `text`, `start` and `end`.\n"\
12
+ "Its goal is to match Instagram's parsing of hashtags."
13
+ spec.homepage = "https://github.com/zucaritask/hashtag_parser"
14
+ spec.license = "MIT"
15
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+ spec.metadata["source_code_uri"] = "https://github.com/zucaritask/hashtag_parser"
19
+
20
+ # Specify which files should be added to the gem when it is released.
21
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
22
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
23
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
24
+ end
25
+ spec.bindir = "exe"
26
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
+ spec.require_paths = ["lib"]
28
+ spec.add_development_dependency "byebug"
29
+ end
@@ -0,0 +1,51 @@
1
+ require_relative "./state_machine"
2
+ require_relative "./tokenizer"
3
+ require_relative "./token"
4
+
5
# Extracts hashtags from a string by walking its token stream and driving a
# StateMachine that records the text and character offsets of each hashtag.
class HashtagParser
  VERSION = "0.1.0".freeze

  # Parses +text+ and returns the hashtags found in it.
  #
  # @param text [String] the text to scan
  # @return [Array<Hash>] one hash per hashtag with the keys +:text+,
  #   +:start+ and +:end+, as accumulated by the state machine
  def self.parse(text)
    tokens = Tokenizer.tokenize(text)
    state_machine = StateMachine.new

    tokens.each_with_index do |token, position|
      # The start-of-string marker occupies slot 0 of the token list, so a
      # token's character offset is always one less than its slot. The marker
      # itself gets -1, which its handler never reads.
      advance(state_machine, token, tokens[position + 1], position - 1)
    end

    state_machine.hashtags
  end

  # Dispatches one token to the handler for its type.
  # +next_token+ is nil only for the end-of-string token, whose branch does
  # not read it.
  def self.advance(state_machine, token, next_token, offset)
    case token.type
    when :sos
      state_machine.hashtag_incoming if next_token.hashtag?
    when :hashtag
      advance_hash_sign(state_machine, next_token, offset)
    when :char
      advance_char(state_machine, token, next_token, offset)
    when :whitespace
      state_machine.hashtag_finishes_at(offset - 1) if state_machine.parsing_hashtag?
      state_machine.hashtag_incoming if next_token.hashtag?
    when :eos
      state_machine.hashtag_finishes_at(offset - 1) if state_machine.parsing_hashtag?
    end
  end

  # Handles a "#": records where the hashtag starts, or abandons it when the
  # following token cannot be part of a hashtag.
  def self.advance_hash_sign(state_machine, next_token, offset)
    return unless state_machine.parsing_hashtag?

    if next_token.end_of_hashtag?
      state_machine.reset_state
    else
      state_machine.hashtag_seen_at(offset)
    end
  end

  # Handles a non-whitespace, non-"#" character while a hashtag is in progress.
  def self.advance_char(state_machine, token, next_token, offset)
    return unless state_machine.parsing_hashtag?

    if token.end_of_hashtag?
      state_machine.hashtag_finishes_at(offset - 1)
    else
      state_machine.consume_char(token.char)
    end

    # A "#" directly after this character closes the current hashtag (a no-op
    # if it was just closed above) and immediately opens the next one.
    return unless next_token.hashtag?

    state_machine.hashtag_finishes_at(offset)
    state_machine.hashtag_incoming
  end

  private_class_method :advance, :advance_hash_sign, :advance_char
end
@@ -0,0 +1,55 @@
1
# Holds the parser's running state: whether a hashtag is currently being
# read, where it started, the characters collected so far, and the list of
# hashtags already completed.
class StateMachine
  attr_accessor :start_index, :parsing_hashtag, :buffer, :hashtags, :consumed_anything

  # Begins idle, with an empty list of completed hashtags.
  def initialize
    @hashtags = []
    set_initial_defaults
  end

  # Clears the per-hashtag fields. The completed hashtags are left untouched.
  def set_initial_defaults
    @buffer = []
    @start_index = 0
    @parsing_hashtag = false
    @consumed_anything = false
  end

  # Appends +char+ to the hashtag being collected and notes that the current
  # hashtag now has content.
  def consume_char(char)
    buffer.push(char)
    self.consumed_anything = true
  end

  # Remembers +index+ as the start position of the hashtag being collected.
  def hashtag_seen_at(index)
    self.start_index = index
  end

  # Switches the machine into "parsing a hashtag" mode.
  def hashtag_incoming
    self.parsing_hashtag = true
  end

  # Closes the current hashtag at +index+. The hashtag is recorded only when
  # at least one character was consumed; the per-hashtag state is cleared
  # either way.
  def hashtag_finishes_at(index)
    hashtags << { text: buffer.join, start: start_index, end: index } if consumed_anything
    reset_state
  end

  # @return [Boolean] whether a hashtag is currently being parsed
  def parsing_hashtag?
    @parsing_hashtag
  end

  # Abandons the current hashtag without recording it.
  def reset_state
    set_initial_defaults
  end
end
data/lib/token.rb ADDED
@@ -0,0 +1,59 @@
1
# One lexical unit of the input text. A token is classified on construction
# as a hash sign (+:hashtag+), whitespace (+:whitespace+) or any other
# character (+:char+).
class Token
  # Characters that cannot be part of a hashtag's text. Each character is
  # listed once; a repeated entry makes Ruby warn about a duplicated range in
  # the character class when warnings are enabled.
  END_OF_HASHTAG_REGEX = /[\\¿¡&%#\s!@$'^*().,\-<>\/|\[\]{}`~=+;?£€•´:]/

  attr_reader :char, :type, :index

  # @param char [String] a single character of the input text
  # @param index [Integer] the position stored on the token
  def initialize(char, index)
    @index = index
    case char
    when "#"
      @type = :hashtag
    when " ", "\n", "\r", "\t"
      @type = :whitespace
    else
      # Only plain characters keep their text; +char+ stays nil otherwise.
      @type = :char
      @char = char
    end
  end

  # @return [Boolean] true when this token is a hash sign
  def hashtag?
    @type == :hashtag
  end

  # @return [Boolean] true when this token terminates a hashtag: a character
  #   matching END_OF_HASHTAG_REGEX, or any token that is neither a character
  #   nor a hash sign
  def end_of_hashtag?
    case type
    when :char
      # Regexp#match? only exists from Ruby 2.4, but the gemspec declares
      # support for Ruby >= 2.3, so use a form available on every version.
      !END_OF_HASHTAG_REGEX.match(char).nil?
    when :hashtag
      false
    else
      true
    end
  end

  # @return [Hash] the token's type, index and character
  def as_json
    {
      type: type,
      index: index,
      char: char
    }
  end
end

# Marker token placed before the first character of the text.
class StartOfString < Token
  # @param index [Integer] the position stored on the token
  def initialize(index)
    @type = :sos
    @index = index
  end

  # The start marker never terminates a hashtag.
  def end_of_hashtag?
    false
  end
end

# Marker token placed after the last character of the text. It inherits
# Token#end_of_hashtag?, which returns true for this type.
class EndOfStringToken < Token
  # @param index [Integer] the position stored on the token
  def initialize(index)
    @type = :eos
    @index = index
  end
end
data/lib/tokenizer.rb ADDED
@@ -0,0 +1,9 @@
1
# Turns raw text into the list of tokens the parser walks.
class Tokenizer
  # Builds the token list for +text+: a start marker with index -1, one Token
  # per character indexed from 0, then an end marker indexed at +text.size+.
  #
  # @param text [String] the text to tokenize
  # @return [Array] the start marker, the character tokens, the end marker
  def self.tokenize(text)
    stream = [StartOfString.new(-1)]
    stream.concat(
      text.split("").each_with_index.map { |character, position| Token.new(character, position) }
    )
    stream.push(EndOfStringToken.new(text.size))
  end
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hashtag_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Mariano Vallés
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-04-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: byebug
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: |-
28
+ Hashtag parser inspired by https://github.com/tonsser/hashtag-rs/
29
+ Given a string, this gem returns an array with an object with `text`, `start` and `end`.
30
+ Its goal is to match Instagram's parsing of hashtags.
31
+ email:
32
+ - zucaritas@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - ".gitignore"
38
+ - ".travis.yml"
39
+ - Gemfile
40
+ - Gemfile.lock
41
+ - LICENSE.txt
42
+ - README.md
43
+ - Rakefile
44
+ - bin/console
45
+ - bin/setup
46
+ - hashtag_parser.gemspec
47
+ - lib/hashtag_parser.rb
48
+ - lib/state_machine.rb
49
+ - lib/token.rb
50
+ - lib/tokenizer.rb
51
+ homepage: https://github.com/zucaritask/hashtag_parser
52
+ licenses:
53
+ - MIT
54
+ metadata:
55
+ homepage_uri: https://github.com/zucaritask/hashtag_parser
56
+ source_code_uri: https://github.com/zucaritask/hashtag_parser
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: 2.3.0
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubygems_version: 3.0.3
73
+ signing_key:
74
+ specification_version: 4
75
+ summary: Hashtag parser returning start, end and text of each hashtag
76
+ test_files: []