quesadilla 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ # encoding: UTF-8
2
+
3
+ module Quesadilla
4
+ # Extract entities from text
5
+ class Extractor
6
+ require 'quesadilla/core_ext/string'
7
+ Dir[File.expand_path('../extractor/*.rb', __FILE__)].each { |f| require f }
8
+
9
+ include Autolinks
10
+ include Emoji
11
+ include Hashtags
12
+ include HTML
13
+ include Markdown
14
+
15
# Build the option set used when the caller supplies no overrides.
#
# Every boolean feature toggle is `true` and `:html_renderer` is the
# `Quesadilla::HTMLRenderer` class. A new Hash is built on every call.
# @return [Hash] default extractor options
def self.default_options
  toggles = [
    :markdown,
    :markdown_code,
    :markdown_links,
    :markdown_triple_emphasis,
    :markdown_double_emphasis,
    :markdown_emphasis,
    :markdown_strikethrough,
    :hashtags,
    :autolinks,
    :emoji,
    :html
  ]

  defaults = Hash[toggles.map { |name| [name, true] }]
  defaults[:html_renderer] = Quesadilla::HTMLRenderer
  defaults
end
32
+
33
# Create an extractor.
#
# The given options are merged over `Quesadilla::Extractor.default_options`, so only the
# keys that should differ from the defaults need to be supplied.
# @param options [Hash, nil] an optional options hash; `nil` is treated as "no overrides".
# @option options markdown [Boolean] Should extract Markdown. Defaults to `true`.
# @option options markdown_code [Boolean] Should extract Markdown code. Defaults to `true`.
# @option options markdown_links [Boolean] Should extract Markdown links. Defaults to `true`.
# @option options markdown_triple_emphasis [Boolean] Should extract Markdown triple emphasis (bold italic). Defaults to `true`.
# @option options markdown_double_emphasis [Boolean] Should extract Markdown double emphasis (bold). Defaults to `true`.
# @option options markdown_emphasis [Boolean] Should extract Markdown emphasis (italic). Defaults to `true`.
# @option options markdown_strikethrough [Boolean] Should extract Markdown strikethrough. Defaults to `true`.
# @option options hashtags [Boolean] Should extract hashtags. Defaults to `true`.
# @option options autolinks [Boolean] Should automatically detect links. Defaults to `true`.
# @option options emoji [Boolean] Should extract named emoji. Defaults to `true`.
# @option options html [Boolean] Should generate HTML. Defaults to `true`.
# @option options html_renderer [Class] class to use as HTML renderer. Defaults to `Quesadilla::HTMLRenderer`.
def initialize(options = {})
  # `options || {}` keeps an explicit nil from raising inside Hash#merge.
  @options = self.class.default_options.merge(options || {})

  # The renderer is only instantiated when HTML output is requested.
  @renderer = @options[:html_renderer].new if @options[:html]
end
50
+
51
# Run the enabled extraction passes over a piece of text.
#
# The input is duplicated before any processing, so the caller's string object is
# never the one stored in `@original_text`.
# @param original_text the text to extract from
# @return [Hash] hash with `:display_text` and `:entities`, plus `:display_html`
#   when the `:html` option is enabled
def extract(original_text)
  @original_text = original_text.dup

  # Emoji colon-syntax is handled before the working copy is taken.
  replace_emoji if @options[:emoji]

  @working_text = @original_text.dup
  @entities = []

  # Entity passes, in this fixed order: Markdown, hashtags, autolinks.
  passes = [
    [:markdown, :extract_markdown],
    [:hashtags, :extract_hashtags],
    [:autolinks, :extract_autolinks]
  ]
  passes.each do |option, pass|
    send(pass) if @options[option]
  end

  # Order entities by where they start in the text.
  @entities.sort! { |left, right| left[:indices].first <=> right[:indices].first }

  # Swap each entity's source text for its display text.
  plain_text = sub_entities(@original_text, @entities)

  result = { display_text: plain_text, entities: @entities }
  return result unless @options[:html]

  result[:display_html] = display_html(plain_text, @entities)
  result
end
84
+
85
+ private
86
+
87
+ # Invisible character from the Unicode Private Use Area (U+F042); it replaces Markdown we've already parsed.
88
+ REPLACE_TOKEN = "\uf042".freeze
89
+
90
# Shorten a URL for display.
#
# A leading `http://` or `https://` and a leading `www.` are removed, the remainder is
# passed through `String#q_truncate(32, omission: '…')`, and one trailing slash is dropped.
#
# The prefix pattern is anchored to the start of the string and applied once, so a
# `www.` or a scheme that appears later in the URL (inside a host name such as
# `mywww.example.com`, or inside a query string) is left untouched.
# @param url [String] the URL to shorten
# @return [String] the display form of the URL
def display_url(url)
  shortened = url.sub(/\A(?:https?:\/\/)?(?:www\.)?/i, '').q_truncate(32, omission: '…')
  shortened.chomp('/')
end
95
+
96
# Make sure a URL carries a scheme so it can be used as a link target.
#
# A URL that already starts with `<scheme>://` is returned unchanged; anything else is
# prefixed with `http://`. The scheme is only recognised at the very start of the string,
# so a `://` that merely appears later (for example in a query parameter) does not
# prevent the prefix from being added.
# @param url [String] the URL as found in the text
# @return [String] the URL with a scheme
def quality_url(url)
  return url if url =~ /\A[a-z][a-z0-9+.\-]*:\/\//i
  "http://#{url}"
end
100
+
101
# Replace the source text of each entity with its replacement text.
#
# With `display` false (the default) the text at `entity[:indices]` (`entity[:text]`) is
# replaced, and `entity[:display_indices]` is written onto every entity to record where
# the replacement sits in the returned string. With `display` true the text at
# `entity[:display_indices]` (`entity[:display_text]`) is replaced and the entities are
# not modified.
#
# The replacement is the block's return value when a block is given, otherwise
# `entity[:display_text]`.
#
# Assumes entities are ordered by start index and do not overlap; the running offset
# below depends on it.
# @param input_text [String] the text to transform (never mutated)
# @param entities [Array<Hash>] entities carrying `:text`, `:display_text`, `:indices`
#   (and `:display_indices` when `display` is true)
# @param display [Boolean] whether indices refer to the display text
# @yield [entity] optionally computes the replacement string for an entity
# @return [String] the text with every entity substituted
def sub_entities(input_text, entities, display = false, &block)
  output_text = input_text

  # Net number of characters removed so far (negative once the text has grown).
  offset = 0

  entities.each do |entity|
    source_text = display ? entity[:display_text] : entity[:text]
    replacement = block_given? ? yield(entity) : entity[:display_text]
    indices = display ? entity[:display_indices] : entity[:indices]

    # Only rebuild the string when the replacement actually differs.
    unless source_text == replacement
      # Translate the entity's position into the current output string.
      start = indices[0] - offset
      stop = indices[1] - offset

      # Slice by length so that an entity starting at index 1 keeps the single
      # character in front of it.
      before_frag = start > 0 ? output_text[0, start] : ''
      after_frag = output_text[stop..-1] || ''

      output_text = before_frag + replacement + after_frag
    end

    adjust = source_text.length - replacement.length
    unless display
      entity[:display_indices] = [entity[:indices][0] - offset, entity[:indices][1] - offset - adjust]
    end
    offset += adjust
  end

  output_text
end
139
+ end
140
+ end
@@ -0,0 +1,57 @@
1
module Quesadilla
  # Default HTML renderer for generating HTML.
  #
  # Every `display_text` argument is inserted into the markup verbatim; it is not escaped
  # here. NOTE(review): the gem's tests show display text arriving already escaped
  # (e.g. `&#x2F;`), so escaping appears to be the caller's job -- confirm before
  # passing raw text. Values that end up inside attributes (`url`, `title`, `hashtag`)
  # are escaped by this class so that a quote cannot terminate the attribute.
  class HTMLRenderer
    # Characters that must not appear raw inside a double-quoted attribute value.
    ATTRIBUTE_ESCAPES = {
      '&' => '&amp;',
      '"' => '&quot;',
      '<' => '&lt;',
      '>' => '&gt;'
    }.freeze

    # Matches `"`, `<`, `>` and any `&` that does not already start a character reference,
    # which keeps the escaping idempotent for pre-escaped input.
    ATTRIBUTE_UNSAFE = /&(?![a-zA-Z][a-zA-Z0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)|["<>]/

    # HTML representation of italic text
    # @param display_text the italic text
    # @return [String] HTML representation of the italic text
    def emphasis(display_text)
      %Q{<em>#{display_text}</em>}
    end

    # HTML representation of bold text
    # @param display_text the bold text
    # @return [String] HTML representation of the bold text
    def double_emphasis(display_text)
      %Q{<strong>#{display_text}</strong>}
    end

    # HTML representation of bold italic text
    # @param display_text the bold italic text
    # @return [String] HTML representation of the bold italic text
    def triple_emphasis(display_text)
      %Q{<strong><em>#{display_text}</em></strong>}
    end

    # HTML representation of strikethrough text
    # @param display_text the strikethrough text
    # @return [String] HTML representation of the strikethrough text
    def strikethrough(display_text)
      %Q{<del>#{display_text}</del>}
    end

    # HTML representation of code
    # @param display_text the text of the code
    # @return [String] HTML representation of the code
    def code(display_text)
      %Q{<code>#{display_text}</code>}
    end

    # HTML representation of a hashtag
    # @param display_text the hashtag text (`#awesome`)
    # @param hashtag the hashtag (just `awesome`); attribute-escaped before use in `href`
    # @return [String] HTML representation of the hashtag
    def hashtag(display_text, hashtag)
      %Q{<a href="#hashtag-#{escape_attribute(hashtag)}" class="hashtag">#{display_text}</a>}
    end

    # HTML representation of a link
    # @param display_text the text of the link
    # @param url the url of the link; attribute-escaped before use in `href`
    # @param title the title of the link; omitted from the markup when nil or empty
    # @return [String] HTML representation of the link
    def link(display_text, url, title = nil)
      title_attr = (title && title.length > 0) ? %Q{ title="#{escape_attribute(title)}"} : ''
      %Q{<a href="#{escape_attribute(url)}" rel="external nofollow" class="link"#{title_attr}>#{display_text}</a>}
    end

    private

    # Escape a value for use inside a double-quoted HTML attribute.
    # Existing character references (`&amp;`, `&#x2F;`, ...) are left alone.
    # @param value the raw attribute value
    # @return [String] the escaped value
    def escape_attribute(value)
      value.to_s.gsub(ATTRIBUTE_UNSAFE, ATTRIBUTE_ESCAPES)
    end
  end
end
@@ -0,0 +1,4 @@
1
+ module Quesadilla
2
+ # Version string of the Quesadilla gem; frozen so it cannot be mutated in place
3
+ VERSION = '0.1.0'.freeze
4
+ end
data/lib/quesadilla.rb ADDED
@@ -0,0 +1,45 @@
1
+ require 'quesadilla/version'
2
+ require 'quesadilla/html_renderer'
3
+ require 'quesadilla/extractor'
4
+
5
# Ruby library for entity-style text parsing. Quesadilla was extracted from [Cheddar](https://cheddarapp.com).
#
# The `ENTITY_TYPE_*` constants are the frozen strings used as entity type names.
module Quesadilla
  # Emphasis (italic) entity type
  ENTITY_TYPE_EMPHASIS = 'emphasis'.freeze

  # Double emphasis (bold) entity type
  ENTITY_TYPE_DOUBLE_EMPHASIS = 'double_emphasis'.freeze

  # Triple emphasis (bold italic) entity type
  ENTITY_TYPE_TRIPLE_EMPHASIS = 'triple_emphasis'.freeze

  # Strikethrough entity type
  ENTITY_TYPE_STRIKETHROUGH = 'strikethrough'.freeze

  # Code entity type
  ENTITY_TYPE_CODE = 'code'.freeze

  # Hashtag entity type
  ENTITY_TYPE_HASHTAG = 'hashtag'.freeze

  # Link entity type
  ENTITY_TYPE_LINK = 'link'.freeze

  class << self
    # Extract entities from text.
    #
    # Convenience wrapper: builds an `Extractor` with the given options and runs it once.
    # @param text the text to extract
    # @param options [Hash] optional overrides handed to `Extractor.new`
    # @option options markdown [Boolean] Should extract Markdown. Defaults to `true`.
    # @option options markdown_code [Boolean] Should extract Markdown code. Defaults to `true`.
    # @option options markdown_links [Boolean] Should extract Markdown links. Defaults to `true`.
    # @option options markdown_triple_emphasis [Boolean] Should extract Markdown triple emphasis (bold italic). Defaults to `true`.
    # @option options markdown_double_emphasis [Boolean] Should extract Markdown double emphasis (bold). Defaults to `true`.
    # @option options markdown_emphasis [Boolean] Should extract Markdown emphasis (italic). Defaults to `true`.
    # @option options markdown_strikethrough [Boolean] Should extract Markdown strikethrough. Defaults to `true`.
    # @option options hashtags [Boolean] Should extract hashtags. Defaults to `true`.
    # @option options autolinks [Boolean] Should automatically detect links. Defaults to `true`.
    # @option options emoji [Boolean] Should extract named emoji. Defaults to `true`.
    # @option options html [Boolean] Should generate HTML. Defaults to `true`.
    # @option options html_renderer [Class] class to use as HTML renderer. Defaults to `Quesadilla::HTMLRenderer`.
    # @return [Hash] hash containing the display text, html text, and entities
    def extract(text, options = {})
      extractor = Extractor.new(options)
      extractor.extract(text)
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'quesadilla/version'
5
+
6
# Gem specification for quesadilla. The version comes from `Quesadilla::VERSION` and the
# file list from `git ls-files`, so the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name = 'quesadilla'
  spec.version = Quesadilla::VERSION
  spec.authors = ['Sam Soffes']
  spec.email = ['sam@soff.es']

  # Summary reuses the description text.
  spec.description = 'Entity-style text parsing'
  spec.summary = spec.description
  spec.homepage = 'https://github.com/soffes/quesadilla'
  spec.license = 'MIT'

  # Packaged files; executables and test files are picked out of that list by path.
  spec.files = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.required_ruby_version = '>= 1.9.3'

  # Hashtag and autolink parsing
  spec.add_dependency 'twitter-text', '~> 1.5.0'

  # Emoji detection
  spec.add_dependency 'named_emoji', '~> 1.1.1'
end
@@ -0,0 +1,84 @@
1
+ # encoding: UTF-8
2
+ require 'test_helper'
3
+
4
module Quesadilla
  # Autolink extraction: bare URLs become `link` entities whose display text is the
  # shortened URL. Assertions pass the expected value first, as `assert_equal` expects,
  # so failure messages label the two sides correctly.
  class AutolinkTest < TestCase
    def test_that_it_extracts_plain_links
      # URL with an explicit scheme: the scheme is dropped from the display text.
      expected = {
        display_text: 'Something with a link: samsoff.es/posts/hire-sam',
        display_html: 'Something with a link: <a href="http://samsoff.es/posts/hire-sam" rel="external nofollow" class="link">samsoff.es&#x2F;posts&#x2F;hire-sam</a>',
        entities: [
          {
            type: 'link',
            text: 'http://samsoff.es/posts/hire-sam',
            display_text: 'samsoff.es/posts/hire-sam',
            url: 'http://samsoff.es/posts/hire-sam',
            indices: [23, 55],
            display_indices: [23, 48]
          }
        ]
      }
      assert_equal expected, extract('Something with a link: http://samsoff.es/posts/hire-sam')

      # Bare domain: the entity URL gains an `http://` scheme.
      expected = {
        display_text: 'Try google.com',
        display_html: 'Try <a href="http://google.com" rel="external nofollow" class="link">google.com</a>',
        entities: [
          {
            type: 'link',
            text: 'google.com',
            display_text: 'google.com',
            url: 'http://google.com',
            indices: [4, 14],
            display_indices: [4, 14]
          }
        ]
      }
      assert_equal expected, extract('Try google.com')
    end

    def test_that_it_pretifies_long_links
      # Long URLs are truncated with an ellipsis in the display text.
      expected = {
        display_text: 'Something with a long link: github.com/samsoffes/api.chedda…',
        display_html: 'Something with a long link: <a href="https://github.com/samsoffes/api.cheddarapp.com/blob/master/Readme.markdown" rel="external nofollow" class="link">github.com&#x2F;samsoffes&#x2F;api.chedda…</a>',
        entities: [
          {
            type: 'link',
            text: 'https://github.com/samsoffes/api.cheddarapp.com/blob/master/Readme.markdown',
            display_text: 'github.com/samsoffes/api.chedda…',
            url: 'https://github.com/samsoffes/api.cheddarapp.com/blob/master/Readme.markdown',
            indices: [28, 103],
            display_indices: [28, 60]
          }
        ]
      }
      assert_equal expected, extract('Something with a long link: https://github.com/samsoffes/api.cheddarapp.com/blob/master/Readme.markdown')
    end

    def test_that_it_extracts_multiple_plain_links
      # The second link's display indices shift left by what the first link lost.
      expected = {
        display_text: 'Something with a link: samsoff.es/posts/hire-sam - apple.com',
        display_html: 'Something with a link: <a href="http://samsoff.es/posts/hire-sam" rel="external nofollow" class="link">samsoff.es&#x2F;posts&#x2F;hire-sam</a> - <a href="http://apple.com" rel="external nofollow" class="link">apple.com</a>',
        entities: [
          {
            type: 'link',
            text: 'http://samsoff.es/posts/hire-sam',
            display_text: 'samsoff.es/posts/hire-sam',
            url: 'http://samsoff.es/posts/hire-sam',
            indices: [23, 55],
            display_indices: [23, 48]
          },
          {
            type: 'link',
            text: 'http://apple.com',
            display_text: 'apple.com',
            url: 'http://apple.com',
            indices: [58, 74],
            display_indices: [51, 60]
          }
        ]
      }
      assert_equal expected, extract('Something with a link: http://samsoff.es/posts/hire-sam - http://apple.com')
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # encoding: UTF-8
2
+ require 'test_helper'
3
+
4
module Quesadilla
  # Emoji handling: literal emoji pass through untouched and `:name:` colon syntax is
  # replaced by the emoji character. Assertions pass the expected value first, as
  # `assert_equal` expects, so failure messages label the two sides correctly.
  class EmojiTest < TestCase
    def test_that_it_supports_emoji
      expected = {
        display_text: 'Something with 👨 beardface',
        display_html: 'Something with 👨 beardface',
        entities: []
      }
      assert_equal expected, extract('Something with 👨 beardface')
    end

    def test_that_it_supports_emoji_with_other_entities
      expected = {
        display_text: 'Something #tagged with 👨 beardface',
        display_html: 'Something <a href="#hashtag-tagged" class="hashtag">#tagged</a> with 👨 beardface',
        entities: [
          {
            type: 'hashtag',
            text: '#tagged',
            display_text: '#tagged',
            hashtag: 'tagged',
            indices: [10, 17],
            display_indices: [10, 17]
          }
        ]
      }
      assert_equal expected, extract('Something #tagged with 👨 beardface')

      # Entities that come after an emoji keep character-based indices.
      expected = {
        display_text: 'After 💇 #foo 👮 Yep',
        display_html: 'After 💇 <a href="#hashtag-foo" class="hashtag">#foo</a> 👮 <strong>Yep</strong>',
        entities: [
          {
            type: 'hashtag',
            text: '#foo',
            display_text: '#foo',
            indices: [8, 12],
            hashtag: 'foo',
            display_indices: [8, 12]
          },
          {
            type: 'double_emphasis',
            text: '**Yep**',
            display_text: 'Yep',
            indices: [15, 22],
            display_indices: [15, 18]
          }
        ]
      }
      assert_equal expected, extract('After 💇 #foo 👮 **Yep**')
    end

    def test_that_it_support_the_colon_syntax
      expected = {
        display_text: 'Beardface is 👨',
        display_html: 'Beardface is 👨',
        entities: []
      }
      assert_equal expected, extract('Beardface is :man:')

      # Colon syntax inside a code span is left alone.
      expected = {
        display_text: 'Beardface is not here :man: 👨',
        display_html: 'Beardface is <code>not here :man:</code> 👨',
        entities: [
          {
            type: 'code',
            text: '`not here :man:`',
            display_text: 'not here :man:',
            indices: [13, 29],
            display_indices: [13, 27]
          }
        ]
      }
      assert_equal expected, extract('Beardface is `not here :man:` :man:')

      # NOTE(review): the case below was already disabled in the original; its expected
      # indices have not been verified -- confirm them before re-enabling.
      # extraction = extract('Something #tagged with :man: **beardface**')
      # assert_equal extraction, {
      #   display_text: 'Something #tagged with 👨 beardface',
      #   display_html: 'Something <a href="#hashtag-tagged" class="hashtag">#tagged</a> with 👨 <strong>beardface</strong>',
      #   entities: [
      #     {
      #       type: 'hashtag',
      #       text: '#tagged',
      #       display_text: '#tagged',
      #       hashtag: 'tagged',
      #       indices: [10, 17],
      #       display_indices: [10, 17]
      #     },
      #     {
      #       type: 'double_emphasis',
      #       text: '**beardface**',
      #       display_text: 'beardface',
      #       indices: [29, 42],
      #       display_indices: [30, 39]
      #     }
      #   ]
      # }
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # encoding: UTF-8
2
+ require 'test_helper'
3
+
4
module Quesadilla
  # Hashtag extraction: `#word` becomes a `hashtag` entity whose display text equals its
  # source text. Assertions pass the expected value first, as `assert_equal` expects,
  # so failure messages label the two sides correctly.
  class HashtagsTest < TestCase
    def test_that_it_extracts_tags
      expected = {
        display_text: 'Something #tagged',
        display_html: 'Something <a href="#hashtag-tagged" class="hashtag">#tagged</a>',
        entities: [
          {
            type: 'hashtag',
            text: '#tagged',
            display_text: '#tagged',
            hashtag: 'tagged',
            indices: [10, 17],
            display_indices: [10, 17]
          }
        ]
      }
      assert_equal expected, extract('Something #tagged')
    end

    def test_that_it_extracts_multiple_tags
      expected = {
        display_text: 'A task with some #tags that are #awesome',
        display_html: 'A task with some <a href="#hashtag-tags" class="hashtag">#tags</a> that are <a href="#hashtag-awesome" class="hashtag">#awesome</a>',
        entities: [
          {
            type: 'hashtag',
            text: '#tags',
            display_text: '#tags',
            hashtag: 'tags',
            indices: [17, 22],
            display_indices: [17, 22]
          },
          {
            type: 'hashtag',
            text: '#awesome',
            display_text: '#awesome',
            hashtag: 'awesome',
            indices: [32, 40],
            display_indices: [32, 40]
          }
        ]
      }
      assert_equal expected, extract('A task with some #tags that are #awesome')
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # encoding: UTF-8
2
+ require 'test_helper'
3
+
4
module Quesadilla
  # Renderer used by the test below: hashtags link to `#tag-<name>` with a `tag` CSS class;
  # everything else is inherited from HTMLRenderer.
  class CustomRenderer < HTMLRenderer
    def hashtag(display_text, hashtag)
      %(<a href="#tag-#{hashtag}" class="tag">#{display_text}</a>)
    end
  end

  # Checks that the `html_renderer` option swaps the class used to build the HTML.
  class HTMLTest < TestCase
    def test_hashtag_url_format
      # Default renderer
      default_html = extract('Something #tagged')[:display_html]
      assert_equal 'Something <a href="#hashtag-tagged" class="hashtag">#tagged</a>', default_html

      # Custom renderer supplied through the options hash
      custom_html = extract('Something #tagged', html_renderer: CustomRenderer)[:display_html]
      assert_equal 'Something <a href="#tag-tagged" class="tag">#tagged</a>', custom_html
    end
  end
end