tag-extractor 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/tag_extractor.rb +202 -19
- metadata +3 -3
data/lib/tag_extractor.rb
CHANGED
@@ -1,39 +1,206 @@
|
|
1
|
+
# Public: TagExtractor module contains various classes to handle tag extraction and manipulation.
|
2
|
+
# The module uses the principles of separators and containers as a way to separate tags from the
|
3
|
+
# rest of the string.
|
4
|
+
#
|
5
|
+
# Examples
|
6
|
+
#
|
7
|
+
# "#social, economy, #physics, #[web development]"
|
8
|
+
# # Here we have 3 tags : social, physics, web development.
|
9
|
+
# # '#' is the tag separator and [] are the containers,
|
10
|
+
# # needed only when tags are composed of multiple words.
|
11
|
+
#
|
12
|
+
# author - Gabriel Dehan (https://github.com/gabriel-dehan)
|
13
|
+
# documentation - https://github.com/gabriel-dehan/TagExtractor
|
14
|
+
# version - 1.0.0
|
15
|
+
#
|
16
|
+
# ______________
|
17
|
+
# < Tag Extractor >
|
18
|
+
# --------------
|
19
|
+
# \ ^__^
|
20
|
+
# \(00)\_______
|
21
|
+
# (__)\ RUBY )\/\
|
22
|
+
# ## ||----w |
|
23
|
+
# || ||
|
24
|
+
#
|
25
|
+
# A minimal ruby library for tag extraction and manipulation.
|
26
|
+
|
1
27
|
module TagExtractor
|
2
|
-
|
28
|
+
# Public: Constant to be passed to TagExtractor subclasses methods,
|
29
|
+
# allowing you to default to the Global Separator you have set through tag_separator=(separator)
|
30
|
+
GLOBAL_SEPARATOR = nil
|
3
31
|
|
4
|
-
|
5
|
-
|
6
|
-
|
32
|
+
@@separator = GLOBAL_SEPARATOR
|
33
|
+
|
34
|
+
# Public : Constant set with a default separator, namely a sharp symbol (#)
|
35
|
+
DEFAULT_SEPARATOR = '#'
|
36
|
+
|
37
|
+
# Public : Constant set with the default container, namely square brackets ([])
|
38
|
+
@@default_container = DEFAULT_CONTAINER = '[]'
|
39
|
+
|
40
|
+
@@container = @@default_container
|
41
|
+
|
42
|
+
class << self
  # Public: Sets the String tag separator used when none is given explicitly.
  #
  # s - The String separator to store (nil leaves the separator unset).
  #
  # Returns the assigned value.
  def tag_separator=(s)
    @@separator = s
  end

  # Public: Returns the String tag separator.
  #
  # Raises TagSeparatorError if no separator is currently set.
  def tag_separator
    return @@separator if @@separator

    raise TagSeparatorError
  end

  # Public: Sets the String multi-words tag container.
  #
  # c - The String container to store, e.g. '[]'.
  #
  # Returns the assigned value.
  def words_container=(c)
    @@container = c
  end
  alias_method :multiwords_container=, :words_container=

  # Public: Returns the String multi-words tag container,
  # falling back to the default container when none is set.
  def words_container
    @@container ? @@container : @@default_container
  end
  alias_method :multiwords_container, :words_container
end
|
11
65
|
|
66
|
+
# Public: TagExtractor::StringExtractor class, allows tag extraction from a String.
|
12
67
|
class StringExtractor
  # Public: Returns the original String.
  attr_reader :source

  # Public: Initialize a StringExtractor.
  #
  # source - A String from which to extract the tags.
  def initialize(source)
    @source = source
  end

  # Public: Extract tags, along with their separators, from the source.
  #
  # separator - A String separator to use for tag extraction.
  #             If none specified, it will default to the global separator.
  # container - A String container to use for tag extraction.
  #             If none specified, it will default to the default container.
  # opts      - A Hash with options for the extraction (default: { multiword => true }).
  #             :multiword - A Boolean to indicate if multiple words tags are to be extracted.
  #                          Treated as true when the key is absent.
  #
  # Returns an Array of tags with separators : ["#tag1", "#[long tag]", "#tag2"]
  def extract_with_separator(separator = nil, container = nil, opts = { multiword: true })
    @source.scan(get_regex(separator, container, multiword?(opts)))
  end

  # Public: Extract tags, removing their separators.
  #
  # separator - A String separator to use for tag extraction.
  #             If none specified, it will default to the global separator.
  # container - A String container to use for tag extraction.
  #             If none specified, it will default to the default container.
  # opts      - A Hash with options for the extraction (default: { multiword => true }).
  #             :multiword - A Boolean to indicate if multiple words tags are to be extracted.
  #                          Treated as true when the key is absent.
  #
  # Returns an Array of tags without separators : ["tag1", "long tag", "tag2"]
  def extract(separator = nil, container = nil, opts = { multiword: true })
    # Resolve the separator once so the very same string is used for scanning and for stripping.
    tag_separator = separator || TagExtractor.tag_separator
    tags = extract_with_separator(tag_separator, container, opts)
    remove_separators_in(tags, container: container, separator: tag_separator)
  end

  private

  # Private: Reads the :multiword option.
  #
  # opts - The options Hash given to an extraction method (may be nil).
  #
  # Returns the value stored under :multiword, or true when the key is missing.
  def multiword?(opts)
    (opts || {}).fetch(:multiword, true)
  end

  # Private: provides the regexp used for scanning a tagful string.
  #
  # separator - The String separator used for tag extraction.
  # container - The String container used for tag extraction.
  # multiword - A Boolean to indicate if multiple words tags are to be extracted.
  #
  # Returns a Regexp.
  def get_regex(separator, container, multiword)
    # We get the default separator & containers if none were specified.
    tag_separator = separator || TagExtractor.tag_separator
    tag_container = container || TagExtractor.words_container

    # Transforms the container string into an array like ['[', ']'].
    left, right = container_array(tag_container)

    # Word matching regex for simple and multiple words.
    mono_word   = '(?:[a-zA-Z](?:\w|-)*)'
    multi_words = '(?:[a-zA-Z](?:\w|-|\s)*)'

    # Escapes everything, so separators and containers are always matched literally.
    left, right, tag_separator = [left, right, tag_separator].map { |s| Regexp.escape(s) }

    if multiword
      %r(#{tag_separator}(?:#{mono_word}|(?:#{left}{1}#{multi_words}#{right}{1})))
    else
      %r(#{tag_separator}(?:#{mono_word}))
    end
  end

  # Private: Remove tags separators and containers from a list of tags.
  #
  # tags - An Array of tags, each one starting with its separator.
  # opts - A Hash of options (default: { container => nil }).
  #        :container - A String to specify the container from which to extract multiple words tags.
  #                     If none specified, it will default to the Default or Global words container.
  #        :separator - The String separator the tags start with. Its length decides how many
  #                     leading characters are dropped. If none specified, a single character
  #                     is dropped.
  #
  # Returns an Array of cleaned tags.
  def remove_separators_in(tags, opts = { container: nil })
    tag_container = opts[:container] || TagExtractor.words_container
    prefix_length = opts[:separator] ? opts[:separator].length : 1
    tags.map { |t| remove_tags_container(t[prefix_length..-1].to_s, tag_container) }
  end

  # Private: Remove tags container from a tag.
  #
  # t - The tag, as a String. It is modified in place.
  # c - the container, as a String.
  #
  # Returns the cleaned tag.
  def remove_tags_container(t, c)
    l, r = container_array(c)
    # gsub! returns nil when nothing was replaced, hence the explicit return of t.
    t.gsub!(l, '')
    t.gsub!(r, '')
    t
  end

  # Private: Transforms the container string into an array.
  #
  # c - the container's String. Falls back to the global words container when nil.
  #
  # Examples
  #
  #   container_array '[]' # => ['[',']']
  #
  # Returns an Array with one String per character of the container.
  def container_array(c)
    (c || TagExtractor.words_container).split('')
  end
end # StringExtractor
|
33
177
|
|
178
|
+
# Public: A class holding methods to handle tags extraction and manipulation from HTML Strings.
|
179
|
+
# Inherits from StringExtractor.
|
34
180
|
class HTMLExtractor < StringExtractor
|
35
|
-
|
36
|
-
|
181
|
+
# Public: Add links around all tags in an HTML String.
|
182
|
+
#
|
183
|
+
# separator - A specific separator, as a String. If none specified, it defaults to the global separator.
|
184
|
+
# container - A specific container, as a String. If none specified, it defaults to the default or global container.
|
185
|
+
# options - A Hash of options for the link extraction (default: { class => nil }).
|
186
|
+
# :class - A String css class to add to the <a> link tag.
|
187
|
+
# :multiword - A Boolean to indicate if multiple words tags are to be extracted.
|
188
|
+
# block - A Block used to specify a link dynamically. It is passed the cleaned tag string and it should return a String to be injected in the href attribute.
|
189
|
+
#
|
190
|
+
# Examples
|
191
|
+
#
|
192
|
+
# # Considering the following string has been used for instantiation :
|
193
|
+
# # 'This is a string with #tag1, #tag2'
|
194
|
+
# html_extractor.convert_tags_to_html_links('#', nil, :class => 'tag tag-link') do |tag_string|
|
195
|
+
# "/tag/#{tag_string.downcase}"
|
196
|
+
# end
|
197
|
+
# # => 'This is a string with <a class="tag tag-link" href="/tag/tag1">#tag1</a>, <a class="tag tag-link" href="/tag/tag2">#tag2</a>'
|
198
|
+
#
|
199
|
+
# Returns an HTML String.
|
200
|
+
def convert_tags_to_html_links(separator = nil, container = nil, options = { class: nil }, &block)
|
201
|
+
multi = options[:multiword] || true
|
202
|
+
@source.gsub!(get_regex(separator, container, multi)) { |name|
|
203
|
+
name = remove_tags_container(name, container)
|
37
204
|
link = block.call(name.slice(1..-1)) || ''
|
38
205
|
'<a ' + (options[:class].nil? ? '' : 'class="' + options[:class] + '" ') + 'href="' + link + '">' + name + '</a>'
|
39
206
|
}
|
@@ -41,6 +208,7 @@ module TagExtractor
|
|
41
208
|
alias :linkify_tags :convert_tags_to_html_links
|
42
209
|
end
|
43
210
|
|
211
|
+
# Private : TagExtractor specific Error and Exceptions.
|
44
212
|
class TagSeparatorError < StandardError
|
45
213
|
def initialize
|
46
214
|
super "Could not find any tag separator"
|
@@ -49,16 +217,31 @@ module TagExtractor
|
|
49
217
|
end
|
50
218
|
|
51
219
|
class String
  # Public: Native String helper around TagExtractor::StringExtractor#extract
  # and TagExtractor::StringExtractor#extract_with_separator.
  #
  # separator      - A String separator to use for tag extraction.
  #                  If none specified, it will default to the global separator.
  # container      - A String container to use for tag extraction.
  #                  If none specified, it will default to the default container.
  # opts           - A Hash with options for the extraction (default: { multiword => true }).
  #                  :multiword - A Boolean to indicate if multiple words tags are to be extracted.
  # with_separator - A Boolean specifying if the tags are to be returned with or without
  #                  separators (default: false).
  #
  # Returns an Array of tags : ["#tag1", "#[long tag]", "#tag2"] or ["tag1", "long tag", "tag2"].
  def extract_tags(separator = nil, container = nil, opts = { multiword: true }, with_separator = false)
    extractor = TagExtractor::StringExtractor.new(self)
    return extractor.extract_with_separator(separator, container, opts) if with_separator

    extractor.extract(separator, container, opts)
  end

  # Public: Native String helper for TagExtractor::HTMLExtractor#convert_tags_to_html_links.
  # See API for TagExtractor::HTMLExtractor#convert_tags_to_html_links
  #
  # The receiver itself (not a copy) is handed to the HTMLExtractor, and all
  # arguments, including the block, are forwarded unchanged.
  #
  # Returns the result of HTMLExtractor#convert_tags_to_html_links
  # (documented there as an HTML String).
  def convert_tags_to_html_links(separator = nil, container = nil, opts = { multiword: true }, &block)
    html_extractor = TagExtractor::HTMLExtractor.new(self)
    html_extractor.convert_tags_to_html_links(separator, container, opts, &block)
  end
  alias_method :linkify_tags, :convert_tags_to_html_links
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tag-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Allow tag extraction and tag conversion in ruby
|
15
15
|
email: dehan.gabriel@gmail.com
|
@@ -38,7 +38,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
38
|
version: '0'
|
39
39
|
requirements: []
|
40
40
|
rubyforge_project:
|
41
|
-
rubygems_version: 1.8.
|
41
|
+
rubygems_version: 1.8.15
|
42
42
|
signing_key:
|
43
43
|
specification_version: 3
|
44
44
|
summary: A minimal ruby library for tag extraction
|