tag-extractor 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/tag_extractor.rb +202 -19
- metadata +3 -3
data/lib/tag_extractor.rb
CHANGED
@@ -1,39 +1,206 @@
|
|
1
|
+
# Public: TagExtractor module contains various classes to handle tag extraction and manipulation.
|
2
|
+
# The class uses the principles of separator and containers as a way to separate tags from the
|
3
|
+
# rest of the string.
|
4
|
+
#
|
5
|
+
# Examples
|
6
|
+
#
|
7
|
+
# "#social, economy, #physics, #[web development]"
|
8
|
+
# # Here we have 3 tags : social, physics, web development.
|
9
|
+
# # '#' is the tag separator and [] are the containers,
|
10
|
+
# # needed only when tags are composed of multiple words.
|
11
|
+
#
|
12
|
+
# author - Gabriel Dehan (https://github.com/gabriel-dehan)
|
13
|
+
# documentation - https://github.com/gabriel-dehan/TagExtractor
|
14
|
+
# version - 1.0.0
|
15
|
+
#
|
16
|
+
# ______________
|
17
|
+
# < Tag Extractor >
|
18
|
+
# --------------
|
19
|
+
# \ ^__^
|
20
|
+
# \(00)\_______
|
21
|
+
# (__)\ RUBY )\/\
|
22
|
+
# ## ||----w |
|
23
|
+
# || ||
|
24
|
+
#
|
25
|
+
# A minimal ruby library for tag extraction and manipulation.
|
26
|
+
|
1
27
|
module TagExtractor
|
2
|
-
|
28
|
+
# Public: Constant to be passed to TagExtractor subclasses methods,
|
29
|
+
# allowing you to default to the Global Separator you have set through tag_separator=(separator)
|
30
|
+
GLOBAL_SEPARATOR = nil
|
3
31
|
|
4
|
-
|
5
|
-
|
6
|
-
|
32
|
+
@@separator = GLOBAL_SEPARATOR
|
33
|
+
|
34
|
+
# Public : Constant set with a default separator, namely a sharp symbol (#)
|
35
|
+
DEFAULT_SEPARATOR = '#'
|
36
|
+
|
37
|
+
# Public : Constant set with the default container, namely square brackets ([])
|
38
|
+
@@default_container = DEFAULT_CONTAINER = '[]'
|
39
|
+
|
40
|
+
@@container = @@default_container
|
41
|
+
|
42
|
+
class << self
|
43
|
+
# Public: Sets the String tag separator.
|
44
|
+
def tag_separator=(s)
|
45
|
+
@@separator = s
|
46
|
+
end
|
47
|
+
|
48
|
+
# Public: Returns the String tag separator.
|
49
|
+
def tag_separator
|
50
|
+
@@separator || raise(TagSeparatorError)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Public: Sets the String multi-words tag container.
|
54
|
+
def words_container=(c)
|
55
|
+
@@container = c
|
56
|
+
end
|
57
|
+
alias :multiwords_container= :words_container=
|
7
58
|
|
8
|
-
|
9
|
-
|
59
|
+
# Public: Returns the String multi-words tag container.
|
60
|
+
def words_container
|
61
|
+
@@container || @@default_container
|
62
|
+
end
|
63
|
+
alias :multiwords_container :words_container
|
10
64
|
end
|
11
65
|
|
66
|
+
# Public: TagExtractor::StringExtractor class, allows tag extraction from a String.
|
12
67
|
class StringExtractor
|
68
|
+
# Public: Returns the original String.
|
13
69
|
attr_reader :source
|
14
70
|
|
71
|
+
# Public: Initialize a StringExtractor.
|
72
|
+
#
|
73
|
+
# source - A String from which to extract the tags.
|
15
74
|
def initialize(source)
|
16
75
|
@source = source
|
17
76
|
end
|
18
77
|
|
19
|
-
|
20
|
-
|
78
|
+
# Public: Extract tags, along with their separators, from the source.
|
79
|
+
#
|
80
|
+
# separator - a separator to use for tag extraction.
|
81
|
+
# If none specified, it will default to the global separator.
|
82
|
+
# container - a container to use for tag extraction.
|
83
|
+
# If none specified, it will default to the default container.
|
84
|
+
# opts - A hash with options for the extraction (default: { multiword => true } ).
|
85
|
+
# :multiword - A boolean to indicate if multiple words tags are to be extracted.
|
86
|
+
#
|
87
|
+
# Returns an Array of tags with separators : ["#tag1", "#[long tag]", "#tag2"]
|
88
|
+
def extract_with_separator(separator = nil, container = nil, opts = { multiword: true })
|
89
|
+
@source.scan(get_regex(separator, container, opts[:multiword]))
|
21
90
|
end
|
22
91
|
|
23
|
-
|
24
|
-
|
92
|
+
# Public: Extract tags, removing their separators.
|
93
|
+
#
|
94
|
+
# separator - A String separator to use for tag extraction.
|
95
|
+
# If none specified, it will default to the global separator.
|
96
|
+
# container - A String container to use for tag extraction.
|
97
|
+
# If none specified, it will default to the default container.
|
98
|
+
# opts - A Hash with options for the extraction (default: { multiword => true } ).
|
99
|
+
# :multiword - A Boolean to indicate if multiple words tags are to be extracted.
|
100
|
+
#
|
101
|
+
# Returns an Array of tags without separators : ["tag1", "long tag", "tag2"]
|
102
|
+
def extract(separator = nil, container = nil, opts = { multiword: true })
|
103
|
+
tags = extract_with_separator(separator, container, opts)
|
104
|
+
remove_separators_in(tags, container: container)
|
25
105
|
end
|
26
106
|
|
27
107
|
private
|
28
|
-
|
108
|
+
# Private: provides the regexp used for scanning a tagful string.
|
109
|
+
#
|
110
|
+
# separator - The String separator used for tag extraction.
|
111
|
+
# container - The String container used for tag extraction.
|
112
|
+
# multiword - A Boolean to indicate if multiple words tags are to be extracted.
|
113
|
+
#
|
114
|
+
# Returns a Regexp.
|
115
|
+
def get_regex(separator, container, multiword)
|
116
|
+
# We get the default separator & containers if none were specified
|
29
117
|
tag_separator = separator || TagExtractor::tag_separator
|
30
|
-
|
118
|
+
tag_container = container || TagExtractor::words_container
|
119
|
+
|
120
|
+
# Transforms the container string into an array like ['[', ']'].
|
121
|
+
left, right = container_array(tag_container)
|
122
|
+
|
123
|
+
# Word matching regex for simple and multiple words.
|
124
|
+
mono_word = '(?:[a-zA-Z](?:\w|-)*)'
|
125
|
+
multi_words = '(?:[a-zA-Z](?:\w|-|\s)*)'
|
126
|
+
|
127
|
+
# Escapes everything.
|
128
|
+
left, right, tag_separator = [left, right, tag_separator].map { |s| Regexp::escape(s) }
|
129
|
+
|
130
|
+
if multiword
|
131
|
+
%r(#{tag_separator}(?:#{mono_word}|(?:#{left}{1}#{multi_words}#{right}{1})))
|
132
|
+
else
|
133
|
+
%r(#{tag_separator}(?:#{mono_word}))
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
# Private: Remove tags separators and containers from a list of tags.
|
138
|
+
#
|
139
|
+
# tags - An Array of tags.
|
140
|
+
# opts - A Hash of options (default: { container => nil }).
|
141
|
+
# :container - A String to specify the container from which to extract multiple words tags.
|
142
|
+
# If none specified, it will default to the Default or Global words container.
|
143
|
+
#
|
144
|
+
# Returns an Array of cleaned tags.
|
145
|
+
def remove_separators_in tags, opts = { container: nil }
|
146
|
+
tag_container = opts[:container] || TagExtractor::words_container
|
147
|
+
tags.collect { |t| t.slice!(0); remove_tags_container(t, tag_container) }
|
148
|
+
end
|
149
|
+
|
150
|
+
# Private: Remove tags container from a tag.
|
151
|
+
#
|
152
|
+
# t - The tag, as a String.
|
153
|
+
# c - the container, as a String.
|
154
|
+
#
|
155
|
+
# Returns the cleaned tag.
|
156
|
+
def remove_tags_container(t, c)
|
157
|
+
l, r = container_array(c)
|
158
|
+
t.gsub!(l, '')
|
159
|
+
t.gsub!(r, '')
|
160
|
+
t
|
161
|
+
end
|
162
|
+
|
163
|
+
# Private: Transforms the container string into an array.
|
164
|
+
#
|
165
|
+
# c - the container's String.
|
166
|
+
#
|
167
|
+
# Examples
|
168
|
+
#
|
169
|
+
# container_array '[]' # => ['[',']']
|
170
|
+
#
|
171
|
+
# Returns an Array of two strings.
|
172
|
+
def container_array(c)
|
173
|
+
c = c || TagExtractor::words_container
|
174
|
+
c = c.split ''
|
31
175
|
end
|
32
176
|
end # StringExtractor
|
33
177
|
|
178
|
+
# Public: A class holding methods to handle tags extraction and manipulation from HTML Strings.
|
179
|
+
# Inherits from StringExtractor.
|
34
180
|
class HTMLExtractor < StringExtractor
|
35
|
-
|
36
|
-
|
181
|
+
# Public: Add links around all tags in an HTML String.
|
182
|
+
#
|
183
|
+
# separator - A specific separator, as a String. If none specified, it defaults to the global separator.
|
184
|
+
# container - A specific container, as a String. If none specified, it defaults to the default or global container.
|
185
|
+
# options - A Hash of options for the link extraction (default: { class => nil }).
|
186
|
+
# :class - A String css class to add to the <a> link tag.
|
187
|
+
# :multiword - A Boolean to indicate if multiple words tags are to be extracted.
|
188
|
+
# block - A Block used to specify a link dynamically. It is passed the cleaned tag string and it should return a String to be injected in the href attribute.
|
189
|
+
#
|
190
|
+
# Examples
|
191
|
+
#
|
192
|
+
# # Considering the following string has been used for instantiation :
|
193
|
+
# # 'This is a string with #tag1, #tag2'
|
194
|
+
# html_extractor.convert_tags_to_html_links('#', :class => 'tag tag-link') do |tag_string|
|
195
|
+
# "/tag/#{tag_string.downcase}"
|
196
|
+
# end
|
197
|
+
# # => 'This is a string with <a class="tag tag-link" href="/tag/tag1">#tag1</a>, <a class="tag tag-link" href="/tag/tag2">#tag2</a>'
|
198
|
+
#
|
199
|
+
# Returns an HTML String.
|
200
|
+
def convert_tags_to_html_links(separator = nil, container = nil, options = { class: nil }, &block)
|
201
|
+
multi = options[:multiword] || true
|
202
|
+
@source.gsub!(get_regex(separator, container, multi)) { |name|
|
203
|
+
name = remove_tags_container(name, container)
|
37
204
|
link = block.call(name.slice(1..-1)) || ''
|
38
205
|
'<a ' + (options[:class].nil? ? '' : 'class="' + options[:class] + '" ') + 'href="' + link + '">' + name + '</a>'
|
39
206
|
}
|
@@ -41,6 +208,7 @@ module TagExtractor
|
|
41
208
|
alias :linkify_tags :convert_tags_to_html_links
|
42
209
|
end
|
43
210
|
|
211
|
+
# Private : TagExtractor specific Error and Exceptions.
|
44
212
|
class TagSeparatorError < StandardError
|
45
213
|
def initialize
|
46
214
|
super "Could not find any tag separator"
|
@@ -49,16 +217,31 @@ module TagExtractor
|
|
49
217
|
end
|
50
218
|
|
51
219
|
class String
|
52
|
-
|
220
|
+
# Public: Native String helper for TagExtractor::StringExtractor#extract and #extract_with_separator.
|
221
|
+
#
|
222
|
+
# separator - a separator to use for tag extraction.
|
223
|
+
# If none specified, it will default to the global separator.
|
224
|
+
# container - a container to use for tag extraction.
|
225
|
+
# If none specified, it will default to the default container.
|
226
|
+
# opts - A hash with options for the extraction (default: { multiword => true } ).
|
227
|
+
# :multiword - A boolean to indicate if multiple words tags are to be extracted.
|
228
|
+
# with_separator - A Boolean specifying if the tags are to be returned with or without separators (default: false).
|
229
|
+
#
|
230
|
+
# Returns an Array of tags : ["#tag1", "#[long tag]", "#tag2"] or ["tag1", "long tag", "tag2"].
|
231
|
+
def extract_tags(separator = nil, container = nil, opts = { multiword: true }, with_separator = false)
|
53
232
|
if with_separator
|
54
|
-
TagExtractor::StringExtractor.new(self).extract_with_separator(separator)
|
233
|
+
TagExtractor::StringExtractor.new(self).extract_with_separator(separator, container, opts)
|
55
234
|
else
|
56
|
-
TagExtractor::StringExtractor.new(self).extract(separator)
|
235
|
+
TagExtractor::StringExtractor.new(self).extract(separator, container, opts)
|
57
236
|
end
|
58
237
|
end
|
59
238
|
|
60
|
-
|
61
|
-
|
239
|
+
# Public: Native String helper for TagExtractor::HTMLExtractor#convert_tags_to_html_links.
|
240
|
+
# See API for TagExtractor::HTMLExtractor#convert_tags_to_html_links
|
241
|
+
#
|
242
|
+
# Returns an HTML String.
|
243
|
+
def convert_tags_to_html_links(separator = nil, container = nil, opts = { multiword: true }, &block)
|
244
|
+
TagExtractor::HTMLExtractor.new(self).convert_tags_to_html_links(separator, container, opts, &block)
|
62
245
|
end
|
63
246
|
alias :linkify_tags :convert_tags_to_html_links
|
64
247
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tag-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-11-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Allow tag extraction and tag conversion in ruby
|
15
15
|
email: dehan.gabriel@gmail.com
|
@@ -38,7 +38,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
38
|
version: '0'
|
39
39
|
requirements: []
|
40
40
|
rubyforge_project:
|
41
|
-
rubygems_version: 1.8.
|
41
|
+
rubygems_version: 1.8.15
|
42
42
|
signing_key:
|
43
43
|
specification_version: 3
|
44
44
|
summary: A minimal ruby library for tag extraction
|