parse_date 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +20 -6
- data/README.md +20 -9
- data/lib/parse_date.rb +7 -4
- data/lib/parse_date/int_from_string.rb +76 -47
- data/lib/parse_date/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 446afc4299ca93634d6689b20d3a32bdb28460da5627701682733e18bb1f0a16
|
4
|
+
data.tar.gz: b5f6bbceab8542cc3c977c1c1ea8a4e250f82a9ae3a1eed2b335050067145703
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 052f5d35a64c52f5bd74af2e487b70c8b03c37252cce33554aa3e7d6ba8141d9f0edd7c719aa487dbddb2e465b76d2d1ab222f057e68c7e6fa322eef313a35e6
|
7
|
+
data.tar.gz: 880e66962d42d0c3f2824510a94edcf9256cc6ba8298815721a2a720e8f9ef2158793820ff71fc3a22a0ad0f3397db3f8e2361753ee661494ed1371117562eb7
|
data/.rubocop_todo.yml
CHANGED
@@ -1,26 +1,32 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2019-10-
|
3
|
+
# on 2019-10-09 15:35:45 -0700 using RuboCop version 0.74.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
+
# Offense count: 2
|
10
|
+
# Cop supports --auto-correct.
|
11
|
+
Lint/StringConversionInInterpolation:
|
12
|
+
Exclude:
|
13
|
+
- 'lib/parse_date/int_from_string.rb'
|
14
|
+
|
9
15
|
# Offense count: 3
|
10
16
|
Metrics/AbcSize:
|
11
|
-
Max:
|
17
|
+
Max: 18
|
12
18
|
|
13
|
-
# Offense count:
|
19
|
+
# Offense count: 4
|
14
20
|
# Configuration parameters: CountComments, ExcludedMethods.
|
15
21
|
# ExcludedMethods: refine
|
16
22
|
Metrics/BlockLength:
|
17
|
-
Max:
|
23
|
+
Max: 561
|
18
24
|
|
19
25
|
# Offense count: 3
|
20
26
|
Metrics/CyclomaticComplexity:
|
21
|
-
Max:
|
27
|
+
Max: 8
|
22
28
|
|
23
|
-
# Offense count:
|
29
|
+
# Offense count: 1
|
24
30
|
Metrics/PerceivedComplexity:
|
25
31
|
Max: 9
|
26
32
|
|
@@ -54,3 +60,11 @@ Style/RegexpLiteral:
|
|
54
60
|
Style/TrailingCommaInArrayLiteral:
|
55
61
|
Exclude:
|
56
62
|
- 'spec/parse_date/int_from_string_spec.rb'
|
63
|
+
|
64
|
+
# Offense count: 1
|
65
|
+
# Cop supports --auto-correct.
|
66
|
+
# Configuration parameters: EnforcedStyleForMultiline.
|
67
|
+
# SupportedStylesForMultiline: comma, consistent_comma, no_comma
|
68
|
+
Style/TrailingCommaInHashLiteral:
|
69
|
+
Exclude:
|
70
|
+
- 'spec/parse_date/int_from_string_spec.rb'
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[](https://badge.fury.io/rb/
|
1
|
+
[](https://badge.fury.io/rb/parse_date)
|
2
2
|
[](https://travis-ci.org/sul-dlss/parse_date)
|
3
3
|
[](https://codeclimate.com/github/sul-dlss/parse_date/maintainability)
|
4
4
|
[](https://codeclimate.com/github/sul-dlss/parse_date/test_coverage)
|
@@ -30,14 +30,25 @@ ParseDate has class methods for date string parsing.
|
|
30
30
|
```
|
31
31
|
require 'parse_date'
|
32
32
|
|
33
|
-
ParseDate.
|
34
|
-
ParseDate.
|
35
|
-
ParseDate.
|
36
|
-
ParseDate.
|
37
|
-
ParseDate.
|
38
|
-
ParseDate.
|
39
|
-
ParseDate.
|
40
|
-
ParseDate.
|
33
|
+
ParseDate.earliest_year('12/25/00') # 2000
|
34
|
+
ParseDate.earliest_year('5-1-21') # 1921
|
35
|
+
ParseDate.earliest_year('1666 B.C.') # -1666
|
36
|
+
ParseDate.earliest_year('-914') # -914
|
37
|
+
ParseDate.earliest_year('[c1926]') # 1926
|
38
|
+
ParseDate.earliest_year('ca. 1558') # 1558
|
39
|
+
ParseDate.earliest_year('195-') # 1950
|
40
|
+
ParseDate.earliest_year('199u') # 1990
|
41
|
+
ParseDate.earliest_year('197?') # 1970
|
42
|
+
ParseDate.earliest_year('196x') # 1960
|
43
|
+
ParseDate.earliest_year('18th century CE') # 1700
|
44
|
+
ParseDate.earliest_year('17uu') # 1700
|
45
|
+
|
46
|
+
ParseDate.latest_year('195-') # 1959
|
47
|
+
ParseDate.latest_year('199u') # 1999
|
48
|
+
ParseDate.latest_year('197?') # 1979
|
49
|
+
ParseDate.latest_year('196x') # 1969
|
50
|
+
ParseDate.latest_year('18th century CE') # 1799
|
51
|
+
ParseDate.latest_year('17uu') # 1799
|
41
52
|
|
42
53
|
ParseDate.year_int_valid?(0) # true
|
43
54
|
ParseDate.year_int_valid?(5) # true
|
data/lib/parse_date.rb
CHANGED
@@ -25,12 +25,15 @@ class ParseDate
|
|
25
25
|
include Singleton
|
26
26
|
extend ParseDate::IntFromString
|
27
27
|
|
28
|
-
# class method delegation
|
29
|
-
def self.
|
30
|
-
ParseDate::IntFromString.
|
28
|
+
# class method delegation
|
29
|
+
def self.earliest_year(orig_date_str)
|
30
|
+
ParseDate::IntFromString.earliest_year(orig_date_str)
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.latest_year(orig_date_str)
|
34
|
+
ParseDate::IntFromString.latest_year(orig_date_str)
|
31
35
|
end
|
32
36
|
|
33
|
-
# class method delegation for ParseDate.year_int_valid?
|
34
37
|
def self.year_int_valid?(orig_date_str)
|
35
38
|
ParseDate::IntFromString.year_int_valid?(orig_date_str)
|
36
39
|
end
|
@@ -7,31 +7,63 @@ class ParseDate
|
|
7
7
|
# Parse (Year) Integers from Date Strings
|
8
8
|
module IntFromString
|
9
9
|
|
10
|
-
#
|
10
|
+
# earliest year as Integer if we can parse one from orig_date_str
|
11
|
+
# e.g. if 17uu, result is 1700
|
11
12
|
# NOTE: if we have a x/x/yy or x-x-yy pattern (the only 2 digit year patterns
|
12
13
|
# found in our actual date strings in stanford-mods records), then
|
13
14
|
# we use 20 as century digits unless it is greater than current year:
|
14
15
|
# 1/1/17 -> 2017
|
15
16
|
# 1/1/27 -> 1927
|
16
17
|
# @return [Integer, nil] Integer year if we could parse one, nil otherwise
|
17
|
-
def self.
|
18
|
+
def self.earliest_year(orig_date_str)
|
18
19
|
return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
|
19
20
|
# B.C. first in case there are 4 digits, e.g. 1600 B.C.
|
20
|
-
return ParseDate.send(:
|
21
|
+
return ParseDate.send(:year_int_for_bc, orig_date_str) if orig_date_str.match(BC_REGEX)
|
21
22
|
|
22
|
-
result = ParseDate.send(:
|
23
|
-
result ||= ParseDate.send(:
|
24
|
-
result ||= ParseDate.send(:
|
25
|
-
result ||= ParseDate.send(:
|
23
|
+
result = ParseDate.send(:first_four_digits, orig_date_str)
|
24
|
+
result ||= ParseDate.send(:year_from_mm_dd_yy, orig_date_str)
|
25
|
+
result ||= ParseDate.send(:first_year_for_decade, orig_date_str) # 19xx or 20xx
|
26
|
+
result ||= ParseDate.send(:first_year_for_century, orig_date_str)
|
27
|
+
result ||= ParseDate.send(:year_for_early_numeric, orig_date_str)
|
26
28
|
unless result
|
27
29
|
# try removing brackets between digits in case we have 169[5] or [18]91
|
28
30
|
no_brackets = ParseDate.send(:remove_brackets, orig_date_str)
|
29
|
-
return
|
31
|
+
return earliest_year(no_brackets) if no_brackets
|
30
32
|
end
|
31
33
|
result.to_i if result && year_int_valid?(result.to_i)
|
32
34
|
end
|
33
35
|
|
34
|
-
#
|
36
|
+
# latest year as Integer if we can parse one from orig_date_str
|
37
|
+
# e.g. if 17uu, result is 1799
|
38
|
+
# NOTE: if we have a x/x/yy or x-x-yy pattern (the only 2 digit year patterns
|
39
|
+
# found in our actual date strings in stanford-mods records), then
|
40
|
+
# we use 20 as century digits unless it is greater than current year:
|
41
|
+
# 1/1/17 -> 2017
|
42
|
+
# 1/1/27 -> 1927
|
43
|
+
# @return [Integer, nil] Integer year if we could parse one, nil otherwise
|
44
|
+
def self.latest_year(orig_date_str)
|
45
|
+
return if orig_date_str == '0000-00-00' # shpc collection has these useless dates
|
46
|
+
|
47
|
+
# B.C. first in case there are 4 digits, e.g. 1600 B.C.
|
48
|
+
# NOTE: may want to parse for last occurrence of 4 consecutive digits
|
49
|
+
return ParseDate.send(:year_int_for_bc, orig_date_str) if orig_date_str.match(BC_REGEX)
|
50
|
+
|
51
|
+
# NOTE: may want to parse for last occurrence of 4 consecutive digits
|
52
|
+
result = ParseDate.send(:first_four_digits, orig_date_str)
|
53
|
+
result ||= ParseDate.send(:year_from_mm_dd_yy, orig_date_str)
|
54
|
+
result ||= ParseDate.send(:last_year_for_decade, orig_date_str) # 19xx or 20xx
|
55
|
+
# NOTE: may want to parse for last occurrence of consecutive digits
|
56
|
+
result ||= ParseDate.send(:last_year_for_century, orig_date_str)
|
57
|
+
result ||= ParseDate.send(:year_for_early_numeric, orig_date_str)
|
58
|
+
unless result
|
59
|
+
# try removing brackets between digits in case we have 169[5] or [18]91
|
60
|
+
no_brackets = ParseDate.send(:remove_brackets, orig_date_str)
|
61
|
+
return earliest_year(no_brackets) if no_brackets
|
62
|
+
end
|
63
|
+
result.to_i if result && year_int_valid?(result.to_i)
|
64
|
+
end
|
65
|
+
|
66
|
+
# true if the year is between -999 and (current year + 2)
|
35
67
|
# @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
|
36
68
|
def self.year_int_valid?(year)
|
37
69
|
return false unless year.is_a? Integer
|
@@ -41,16 +73,6 @@ class ParseDate
|
|
41
73
|
|
42
74
|
protected
|
43
75
|
|
44
|
-
# get String sortable value year if we can parse date_str to get a year.
|
45
|
-
# @return [String, nil] String sortable year if we could parse one, nil otherwise
|
46
|
-
# note that these values must *lexically* sort to create a chronological sort.
|
47
|
-
def sortable_year_for_yyyy_or_yy(orig_date_str)
|
48
|
-
# most date strings have a four digit year
|
49
|
-
result = ParseDate.sortable_year_for_yyyy(orig_date_str)
|
50
|
-
result ||= ParseDate.sortable_year_for_yy(orig_date_str) # 19xx or 20xx
|
51
|
-
result
|
52
|
-
end
|
53
|
-
|
54
76
|
BRACKETS_BETWEEN_DIGITS_REXEXP = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
|
55
77
|
|
56
78
|
# removes brackets between digits such as 169[5] or [18]91
|
@@ -60,18 +82,18 @@ class ParseDate
|
|
60
82
|
|
61
83
|
# looks for 4 consecutive digits in orig_date_str and returns first occurrence if found
|
62
84
|
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str has yyyy, nil otherwise
|
63
|
-
def
|
85
|
+
def first_four_digits(orig_date_str)
|
64
86
|
matches = orig_date_str.match(/\d{4}/) if orig_date_str
|
65
87
|
matches&.to_s
|
66
88
|
end
|
67
89
|
|
68
90
|
# returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
|
69
|
-
# note that these are the only 2 digit year patterns found in
|
91
|
+
# note that these are the only 2 digit year patterns found in stanford-mods date fields
|
70
92
|
# we use 20 as century digits unless it is greater than current year:
|
71
93
|
# 1/1/17 -> 2017
|
72
94
|
# 1/1/27 -> 1927
|
73
95
|
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if orig_date_str matches pattern, nil otherwise
|
74
|
-
def
|
96
|
+
def year_from_mm_dd_yy(orig_date_str)
|
75
97
|
return unless orig_date_str
|
76
98
|
|
77
99
|
slash_matches = orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
|
@@ -89,57 +111,64 @@ class ParseDate
|
|
89
111
|
|
90
112
|
DECADE_4CHAR_REGEXP = Regexp.new('(^|\D)\d{3}[u\-?x]')
|
91
113
|
|
92
|
-
#
|
114
|
+
# first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
|
93
115
|
# note that these are the only decade patterns found in our actual date strings in MODS records
|
94
116
|
# @return [String, nil] 4 digit year (e.g. 1860, 1950) if orig_date_str matches pattern, nil otherwise
|
95
|
-
def
|
117
|
+
def first_year_for_decade(orig_date_str)
|
96
118
|
decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
|
97
119
|
changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
|
98
|
-
ParseDate.
|
120
|
+
ParseDate.first_four_digits(changed_to_zero) if changed_to_zero
|
121
|
+
end
|
122
|
+
|
123
|
+
# last year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
|
124
|
+
# note that these are the only decade patterns found in our actual date strings in MODS records
|
125
|
+
# @return [String, nil] 4 digit year (e.g. 1869, 1959) if orig_date_str matches pattern, nil otherwise
|
126
|
+
def last_year_for_decade(orig_date_str)
|
127
|
+
decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
|
128
|
+
changed_to_nine = decade_matches.to_s.tr('u\-?x', '9') if decade_matches
|
129
|
+
ParseDate.first_four_digits(changed_to_nine) if changed_to_nine
|
99
130
|
end
|
100
131
|
|
101
132
|
CENTURY_WORD_REGEXP = Regexp.new('(\d{1,2}).*century')
|
102
133
|
CENTURY_4CHAR_REGEXP = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)')
|
103
134
|
|
104
|
-
#
|
135
|
+
# first year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
|
105
136
|
# note that these are the only century patterns found in our actual date strings in MODS records
|
106
137
|
# @return [String, nil] yy00 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
|
107
|
-
def
|
138
|
+
def first_year_for_century(orig_date_str)
|
108
139
|
return unless orig_date_str
|
109
140
|
return if orig_date_str =~ /B\.C\./
|
141
|
+
return "#{Regexp.last_match(1)}00" if orig_date_str.match(CENTURY_4CHAR_REGEXP)
|
142
|
+
return "#{(Regexp.last_match(1).to_i - 1).to_s}00" if orig_date_str.match(CENTURY_WORD_REGEXP)
|
143
|
+
end
|
110
144
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
century_str_matches = orig_date_str.match(CENTURY_WORD_REGEXP)
|
119
|
-
return unless century_str_matches
|
145
|
+
# last year of century (as String) if we have: yyuu, yy--, yy--? or xxth century pattern
|
146
|
+
# note that these are the only century patterns found in our actual date strings in MODS records
|
147
|
+
# @return [String, nil] yy99 if orig_date_str matches pattern, nil otherwise; also nil if B.C. in pattern
|
148
|
+
def last_year_for_century(orig_date_str)
|
149
|
+
return unless orig_date_str
|
150
|
+
return if orig_date_str =~ /B\.C\./
|
151
|
+
return "#{Regexp.last_match(1)}99" if orig_date_str.match(CENTURY_4CHAR_REGEXP)
|
120
152
|
|
121
|
-
|
122
|
-
return
|
123
|
-
return '0' + yy + '00' if yy.length == 1
|
153
|
+
# TODO: do we want to look for the very last match of digits before "century" instead of the first one?
|
154
|
+
return "#{(Regexp.last_match(1).to_i - 1).to_s}99" if orig_date_str.match(CENTURY_WORD_REGEXP)
|
124
155
|
end
|
125
156
|
|
126
157
|
BC_REGEX = Regexp.new('(\d{1,4}).*' + Regexp.escape('B.C.'))
|
127
158
|
|
128
|
-
#
|
159
|
+
# Integer sortable value for B.C. if we have B.C. pattern
|
129
160
|
# @return [Integer, nil] Integer sortable -ddd if B.C. in pattern; nil otherwise
|
130
|
-
def
|
161
|
+
def year_int_for_bc(orig_date_str)
|
131
162
|
bc_matches = orig_date_str.match(BC_REGEX) if orig_date_str
|
132
163
|
"-#{Regexp.last_match(1)}".to_i if bc_matches
|
133
164
|
end
|
134
165
|
|
135
166
|
EARLY_NUMERIC = Regexp.new('^\-?\d{1,3}$')
|
136
167
|
|
137
|
-
#
|
138
|
-
# @return [
|
139
|
-
def
|
140
|
-
|
141
|
-
|
142
|
-
orig_date_str.to_i if orig_date_str =~ /^-\d{4}$/
|
168
|
+
# year if orig_date_str contains yyy, yy, y, -y, -yy, -yyy, -yyyy
|
169
|
+
# @return [String, nil] -ddd if orig_date_str matches pattern; nil otherwise
|
170
|
+
def year_for_early_numeric(orig_date_str)
|
171
|
+
orig_date_str if orig_date_str.match(EARLY_NUMERIC) || orig_date_str =~ /^-\d{4}$/
|
143
172
|
end
|
144
173
|
end
|
145
174
|
end
|
data/lib/parse_date/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_date
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naomi Dushay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: zeitwerk
|